html5parser.py 114 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795
  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import with_metaclass, viewkeys
  3. import types
  4. from . import _inputstream
  5. from . import _tokenizer
  6. from . import treebuilders
  7. from .treebuilders.base import Marker
  8. from . import _utils
  9. from .constants import (
  10. spaceCharacters, asciiUpper2Lower,
  11. specialElements, headingElements, cdataElements, rcdataElements,
  12. tokenTypes, tagTokenTypes,
  13. namespaces,
  14. htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
  15. adjustForeignAttributes as adjustForeignAttributesMap,
  16. adjustMathMLAttributes, adjustSVGAttributes,
  17. E,
  18. _ReparseException
  19. )
  20. def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
  21. """Parse an HTML document as a string or file-like object into a tree
  22. :arg doc: the document to parse as a string or file-like object
  23. :arg treebuilder: the treebuilder to use when parsing
  24. :arg namespaceHTMLElements: whether or not to namespace HTML elements
  25. :returns: parsed tree
  26. Example:
  27. >>> from html5lib.html5parser import parse
  28. >>> parse('<html><body><p>This is a doc</p></body></html>')
  29. <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
  30. """
  31. tb = treebuilders.getTreeBuilder(treebuilder)
  32. p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
  33. return p.parse(doc, **kwargs)
  34. def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
  35. """Parse an HTML fragment as a string or file-like object into a tree
  36. :arg doc: the fragment to parse as a string or file-like object
  37. :arg container: the container context to parse the fragment in
  38. :arg treebuilder: the treebuilder to use when parsing
  39. :arg namespaceHTMLElements: whether or not to namespace HTML elements
  40. :returns: parsed tree
  41. Example:
  42. >>> from html5lib.html5libparser import parseFragment
  43. >>> parseFragment('<b>this is a fragment</b>')
  44. <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
  45. """
  46. tb = treebuilders.getTreeBuilder(treebuilder)
  47. p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
  48. return p.parseFragment(doc, container=container, **kwargs)
  49. def method_decorator_metaclass(function):
  50. class Decorated(type):
  51. def __new__(meta, classname, bases, classDict):
  52. for attributeName, attribute in classDict.items():
  53. if isinstance(attribute, types.FunctionType):
  54. attribute = function(attribute)
  55. classDict[attributeName] = attribute
  56. return type.__new__(meta, classname, bases, classDict)
  57. return Decorated
  58. class HTMLParser(object):
  59. """HTML parser
  60. Generates a tree structure from a stream of (possibly malformed) HTML.
  61. """
  62. def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
  63. """
  64. :arg tree: a treebuilder class controlling the type of tree that will be
  65. returned. Built in treebuilders can be accessed through
  66. html5lib.treebuilders.getTreeBuilder(treeType)
  67. :arg strict: raise an exception when a parse error is encountered
  68. :arg namespaceHTMLElements: whether or not to namespace HTML elements
  69. :arg debug: whether or not to enable debug mode which logs things
  70. Example:
  71. >>> from html5lib.html5parser import HTMLParser
  72. >>> parser = HTMLParser() # generates parser with etree builder
  73. >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
  74. """
  75. # Raise an exception on the first error encountered
  76. self.strict = strict
  77. if tree is None:
  78. tree = treebuilders.getTreeBuilder("etree")
  79. self.tree = tree(namespaceHTMLElements)
  80. self.errors = []
  81. self.phases = {name: cls(self, self.tree) for name, cls in
  82. getPhases(debug).items()}
  83. def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
  84. self.innerHTMLMode = innerHTML
  85. self.container = container
  86. self.scripting = scripting
  87. self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
  88. self.reset()
  89. try:
  90. self.mainLoop()
  91. except _ReparseException:
  92. self.reset()
  93. self.mainLoop()
  94. def reset(self):
  95. self.tree.reset()
  96. self.firstStartTag = False
  97. self.errors = []
  98. self.log = [] # only used with debug mode
  99. # "quirks" / "limited quirks" / "no quirks"
  100. self.compatMode = "no quirks"
  101. if self.innerHTMLMode:
  102. self.innerHTML = self.container.lower()
  103. if self.innerHTML in cdataElements:
  104. self.tokenizer.state = self.tokenizer.rcdataState
  105. elif self.innerHTML in rcdataElements:
  106. self.tokenizer.state = self.tokenizer.rawtextState
  107. elif self.innerHTML == 'plaintext':
  108. self.tokenizer.state = self.tokenizer.plaintextState
  109. else:
  110. # state already is data state
  111. # self.tokenizer.state = self.tokenizer.dataState
  112. pass
  113. self.phase = self.phases["beforeHtml"]
  114. self.phase.insertHtmlElement()
  115. self.resetInsertionMode()
  116. else:
  117. self.innerHTML = False # pylint:disable=redefined-variable-type
  118. self.phase = self.phases["initial"]
  119. self.lastPhase = None
  120. self.beforeRCDataPhase = None
  121. self.framesetOK = True
  122. @property
  123. def documentEncoding(self):
  124. """Name of the character encoding that was used to decode the input stream, or
  125. :obj:`None` if that is not determined yet
  126. """
  127. if not hasattr(self, 'tokenizer'):
  128. return None
  129. return self.tokenizer.stream.charEncoding[0].name
  130. def isHTMLIntegrationPoint(self, element):
  131. if (element.name == "annotation-xml" and
  132. element.namespace == namespaces["mathml"]):
  133. return ("encoding" in element.attributes and
  134. element.attributes["encoding"].translate(
  135. asciiUpper2Lower) in
  136. ("text/html", "application/xhtml+xml"))
  137. else:
  138. return (element.namespace, element.name) in htmlIntegrationPointElements
  139. def isMathMLTextIntegrationPoint(self, element):
  140. return (element.namespace, element.name) in mathmlTextIntegrationPointElements
  141. def mainLoop(self):
  142. CharactersToken = tokenTypes["Characters"]
  143. SpaceCharactersToken = tokenTypes["SpaceCharacters"]
  144. StartTagToken = tokenTypes["StartTag"]
  145. EndTagToken = tokenTypes["EndTag"]
  146. CommentToken = tokenTypes["Comment"]
  147. DoctypeToken = tokenTypes["Doctype"]
  148. ParseErrorToken = tokenTypes["ParseError"]
  149. for token in self.tokenizer:
  150. prev_token = None
  151. new_token = token
  152. while new_token is not None:
  153. prev_token = new_token
  154. currentNode = self.tree.openElements[-1] if self.tree.openElements else None
  155. currentNodeNamespace = currentNode.namespace if currentNode else None
  156. currentNodeName = currentNode.name if currentNode else None
  157. type = new_token["type"]
  158. if type == ParseErrorToken:
  159. self.parseError(new_token["data"], new_token.get("datavars", {}))
  160. new_token = None
  161. else:
  162. if (len(self.tree.openElements) == 0 or
  163. currentNodeNamespace == self.tree.defaultNamespace or
  164. (self.isMathMLTextIntegrationPoint(currentNode) and
  165. ((type == StartTagToken and
  166. token["name"] not in frozenset(["mglyph", "malignmark"])) or
  167. type in (CharactersToken, SpaceCharactersToken))) or
  168. (currentNodeNamespace == namespaces["mathml"] and
  169. currentNodeName == "annotation-xml" and
  170. type == StartTagToken and
  171. token["name"] == "svg") or
  172. (self.isHTMLIntegrationPoint(currentNode) and
  173. type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
  174. phase = self.phase
  175. else:
  176. phase = self.phases["inForeignContent"]
  177. if type == CharactersToken:
  178. new_token = phase.processCharacters(new_token)
  179. elif type == SpaceCharactersToken:
  180. new_token = phase.processSpaceCharacters(new_token)
  181. elif type == StartTagToken:
  182. new_token = phase.processStartTag(new_token)
  183. elif type == EndTagToken:
  184. new_token = phase.processEndTag(new_token)
  185. elif type == CommentToken:
  186. new_token = phase.processComment(new_token)
  187. elif type == DoctypeToken:
  188. new_token = phase.processDoctype(new_token)
  189. if (type == StartTagToken and prev_token["selfClosing"] and
  190. not prev_token["selfClosingAcknowledged"]):
  191. self.parseError("non-void-element-with-trailing-solidus",
  192. {"name": prev_token["name"]})
  193. # When the loop finishes it's EOF
  194. reprocess = True
  195. phases = []
  196. while reprocess:
  197. phases.append(self.phase)
  198. reprocess = self.phase.processEOF()
  199. if reprocess:
  200. assert self.phase not in phases
  201. def parse(self, stream, *args, **kwargs):
  202. """Parse a HTML document into a well-formed tree
  203. :arg stream: a file-like object or string containing the HTML to be parsed
  204. The optional encoding parameter must be a string that indicates
  205. the encoding. If specified, that encoding will be used,
  206. regardless of any BOM or later declaration (such as in a meta
  207. element).
  208. :arg scripting: treat noscript elements as if JavaScript was turned on
  209. :returns: parsed tree
  210. Example:
  211. >>> from html5lib.html5parser import HTMLParser
  212. >>> parser = HTMLParser()
  213. >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
  214. <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
  215. """
  216. self._parse(stream, False, None, *args, **kwargs)
  217. return self.tree.getDocument()
  218. def parseFragment(self, stream, *args, **kwargs):
  219. """Parse a HTML fragment into a well-formed tree fragment
  220. :arg container: name of the element we're setting the innerHTML
  221. property if set to None, default to 'div'
  222. :arg stream: a file-like object or string containing the HTML to be parsed
  223. The optional encoding parameter must be a string that indicates
  224. the encoding. If specified, that encoding will be used,
  225. regardless of any BOM or later declaration (such as in a meta
  226. element)
  227. :arg scripting: treat noscript elements as if JavaScript was turned on
  228. :returns: parsed tree
  229. Example:
  230. >>> from html5lib.html5libparser import HTMLParser
  231. >>> parser = HTMLParser()
  232. >>> parser.parseFragment('<b>this is a fragment</b>')
  233. <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
  234. """
  235. self._parse(stream, True, *args, **kwargs)
  236. return self.tree.getFragment()
  237. def parseError(self, errorcode="XXX-undefined-error", datavars=None):
  238. # XXX The idea is to make errorcode mandatory.
  239. if datavars is None:
  240. datavars = {}
  241. self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
  242. if self.strict:
  243. raise ParseError(E[errorcode] % datavars)
  244. def adjustMathMLAttributes(self, token):
  245. adjust_attributes(token, adjustMathMLAttributes)
  246. def adjustSVGAttributes(self, token):
  247. adjust_attributes(token, adjustSVGAttributes)
  248. def adjustForeignAttributes(self, token):
  249. adjust_attributes(token, adjustForeignAttributesMap)
  250. def reparseTokenNormal(self, token):
  251. # pylint:disable=unused-argument
  252. self.parser.phase()
  253. def resetInsertionMode(self):
  254. # The name of this method is mostly historical. (It's also used in the
  255. # specification.)
  256. last = False
  257. newModes = {
  258. "select": "inSelect",
  259. "td": "inCell",
  260. "th": "inCell",
  261. "tr": "inRow",
  262. "tbody": "inTableBody",
  263. "thead": "inTableBody",
  264. "tfoot": "inTableBody",
  265. "caption": "inCaption",
  266. "colgroup": "inColumnGroup",
  267. "table": "inTable",
  268. "head": "inBody",
  269. "body": "inBody",
  270. "frameset": "inFrameset",
  271. "html": "beforeHead"
  272. }
  273. for node in self.tree.openElements[::-1]:
  274. nodeName = node.name
  275. new_phase = None
  276. if node == self.tree.openElements[0]:
  277. assert self.innerHTML
  278. last = True
  279. nodeName = self.innerHTML
  280. # Check for conditions that should only happen in the innerHTML
  281. # case
  282. if nodeName in ("select", "colgroup", "head", "html"):
  283. assert self.innerHTML
  284. if not last and node.namespace != self.tree.defaultNamespace:
  285. continue
  286. if nodeName in newModes:
  287. new_phase = self.phases[newModes[nodeName]]
  288. break
  289. elif last:
  290. new_phase = self.phases["inBody"]
  291. break
  292. self.phase = new_phase
  293. def parseRCDataRawtext(self, token, contentType):
  294. # Generic RCDATA/RAWTEXT Parsing algorithm
  295. assert contentType in ("RAWTEXT", "RCDATA")
  296. self.tree.insertElement(token)
  297. if contentType == "RAWTEXT":
  298. self.tokenizer.state = self.tokenizer.rawtextState
  299. else:
  300. self.tokenizer.state = self.tokenizer.rcdataState
  301. self.originalPhase = self.phase
  302. self.phase = self.phases["text"]
  303. @_utils.memoize
  304. def getPhases(debug):
  305. def log(function):
  306. """Logger that records which phase processes each token"""
  307. type_names = {value: key for key, value in tokenTypes.items()}
  308. def wrapped(self, *args, **kwargs):
  309. if function.__name__.startswith("process") and len(args) > 0:
  310. token = args[0]
  311. info = {"type": type_names[token['type']]}
  312. if token['type'] in tagTokenTypes:
  313. info["name"] = token['name']
  314. self.parser.log.append((self.parser.tokenizer.state.__name__,
  315. self.parser.phase.__class__.__name__,
  316. self.__class__.__name__,
  317. function.__name__,
  318. info))
  319. return function(self, *args, **kwargs)
  320. else:
  321. return function(self, *args, **kwargs)
  322. return wrapped
  323. def getMetaclass(use_metaclass, metaclass_func):
  324. if use_metaclass:
  325. return method_decorator_metaclass(metaclass_func)
  326. else:
  327. return type
  328. # pylint:disable=unused-argument
  329. class Phase(with_metaclass(getMetaclass(debug, log))):
  330. """Base class for helper object that implements each phase of processing
  331. """
  332. __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
  333. def __init__(self, parser, tree):
  334. self.parser = parser
  335. self.tree = tree
  336. self.__startTagCache = {}
  337. self.__endTagCache = {}
  338. def processEOF(self):
  339. raise NotImplementedError
  340. def processComment(self, token):
  341. # For most phases the following is correct. Where it's not it will be
  342. # overridden.
  343. self.tree.insertComment(token, self.tree.openElements[-1])
  344. def processDoctype(self, token):
  345. self.parser.parseError("unexpected-doctype")
  346. def processCharacters(self, token):
  347. self.tree.insertText(token["data"])
  348. def processSpaceCharacters(self, token):
  349. self.tree.insertText(token["data"])
  350. def processStartTag(self, token):
  351. # Note the caching is done here rather than BoundMethodDispatcher as doing it there
  352. # requires a circular reference to the Phase, and this ends up with a significant
  353. # (CPython 2.7, 3.8) GC cost when parsing many short inputs
  354. name = token["name"]
  355. # In Py2, using `in` is quicker in general than try/except KeyError
  356. # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
  357. if name in self.__startTagCache:
  358. func = self.__startTagCache[name]
  359. else:
  360. func = self.__startTagCache[name] = self.startTagHandler[name]
  361. # bound the cache size in case we get loads of unknown tags
  362. while len(self.__startTagCache) > len(self.startTagHandler) * 1.1:
  363. # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
  364. self.__startTagCache.pop(next(iter(self.__startTagCache)))
  365. return func(token)
  366. def startTagHtml(self, token):
  367. if not self.parser.firstStartTag and token["name"] == "html":
  368. self.parser.parseError("non-html-root")
  369. # XXX Need a check here to see if the first start tag token emitted is
  370. # this token... If it's not, invoke self.parser.parseError().
  371. for attr, value in token["data"].items():
  372. if attr not in self.tree.openElements[0].attributes:
  373. self.tree.openElements[0].attributes[attr] = value
  374. self.parser.firstStartTag = False
  375. def processEndTag(self, token):
  376. # Note the caching is done here rather than BoundMethodDispatcher as doing it there
  377. # requires a circular reference to the Phase, and this ends up with a significant
  378. # (CPython 2.7, 3.8) GC cost when parsing many short inputs
  379. name = token["name"]
  380. # In Py2, using `in` is quicker in general than try/except KeyError
  381. # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
  382. if name in self.__endTagCache:
  383. func = self.__endTagCache[name]
  384. else:
  385. func = self.__endTagCache[name] = self.endTagHandler[name]
  386. # bound the cache size in case we get loads of unknown tags
  387. while len(self.__endTagCache) > len(self.endTagHandler) * 1.1:
  388. # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
  389. self.__endTagCache.pop(next(iter(self.__endTagCache)))
  390. return func(token)
  391. class InitialPhase(Phase):
  392. __slots__ = tuple()
  393. def processSpaceCharacters(self, token):
  394. pass
  395. def processComment(self, token):
  396. self.tree.insertComment(token, self.tree.document)
  397. def processDoctype(self, token):
  398. name = token["name"]
  399. publicId = token["publicId"]
  400. systemId = token["systemId"]
  401. correct = token["correct"]
  402. if (name != "html" or publicId is not None or
  403. systemId is not None and systemId != "about:legacy-compat"):
  404. self.parser.parseError("unknown-doctype")
  405. if publicId is None:
  406. publicId = ""
  407. self.tree.insertDoctype(token)
  408. if publicId != "":
  409. publicId = publicId.translate(asciiUpper2Lower)
  410. if (not correct or token["name"] != "html" or
  411. publicId.startswith(
  412. ("+//silmaril//dtd html pro v0r11 19970101//",
  413. "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
  414. "-//as//dtd html 3.0 aswedit + extensions//",
  415. "-//ietf//dtd html 2.0 level 1//",
  416. "-//ietf//dtd html 2.0 level 2//",
  417. "-//ietf//dtd html 2.0 strict level 1//",
  418. "-//ietf//dtd html 2.0 strict level 2//",
  419. "-//ietf//dtd html 2.0 strict//",
  420. "-//ietf//dtd html 2.0//",
  421. "-//ietf//dtd html 2.1e//",
  422. "-//ietf//dtd html 3.0//",
  423. "-//ietf//dtd html 3.2 final//",
  424. "-//ietf//dtd html 3.2//",
  425. "-//ietf//dtd html 3//",
  426. "-//ietf//dtd html level 0//",
  427. "-//ietf//dtd html level 1//",
  428. "-//ietf//dtd html level 2//",
  429. "-//ietf//dtd html level 3//",
  430. "-//ietf//dtd html strict level 0//",
  431. "-//ietf//dtd html strict level 1//",
  432. "-//ietf//dtd html strict level 2//",
  433. "-//ietf//dtd html strict level 3//",
  434. "-//ietf//dtd html strict//",
  435. "-//ietf//dtd html//",
  436. "-//metrius//dtd metrius presentational//",
  437. "-//microsoft//dtd internet explorer 2.0 html strict//",
  438. "-//microsoft//dtd internet explorer 2.0 html//",
  439. "-//microsoft//dtd internet explorer 2.0 tables//",
  440. "-//microsoft//dtd internet explorer 3.0 html strict//",
  441. "-//microsoft//dtd internet explorer 3.0 html//",
  442. "-//microsoft//dtd internet explorer 3.0 tables//",
  443. "-//netscape comm. corp.//dtd html//",
  444. "-//netscape comm. corp.//dtd strict html//",
  445. "-//o'reilly and associates//dtd html 2.0//",
  446. "-//o'reilly and associates//dtd html extended 1.0//",
  447. "-//o'reilly and associates//dtd html extended relaxed 1.0//",
  448. "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
  449. "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
  450. "-//spyglass//dtd html 2.0 extended//",
  451. "-//sq//dtd html 2.0 hotmetal + extensions//",
  452. "-//sun microsystems corp.//dtd hotjava html//",
  453. "-//sun microsystems corp.//dtd hotjava strict html//",
  454. "-//w3c//dtd html 3 1995-03-24//",
  455. "-//w3c//dtd html 3.2 draft//",
  456. "-//w3c//dtd html 3.2 final//",
  457. "-//w3c//dtd html 3.2//",
  458. "-//w3c//dtd html 3.2s draft//",
  459. "-//w3c//dtd html 4.0 frameset//",
  460. "-//w3c//dtd html 4.0 transitional//",
  461. "-//w3c//dtd html experimental 19960712//",
  462. "-//w3c//dtd html experimental 970421//",
  463. "-//w3c//dtd w3 html//",
  464. "-//w3o//dtd w3 html 3.0//",
  465. "-//webtechs//dtd mozilla html 2.0//",
  466. "-//webtechs//dtd mozilla html//")) or
  467. publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
  468. "-/w3c/dtd html 4.0 transitional/en",
  469. "html") or
  470. publicId.startswith(
  471. ("-//w3c//dtd html 4.01 frameset//",
  472. "-//w3c//dtd html 4.01 transitional//")) and
  473. systemId is None or
  474. systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
  475. self.parser.compatMode = "quirks"
  476. elif (publicId.startswith(
  477. ("-//w3c//dtd xhtml 1.0 frameset//",
  478. "-//w3c//dtd xhtml 1.0 transitional//")) or
  479. publicId.startswith(
  480. ("-//w3c//dtd html 4.01 frameset//",
  481. "-//w3c//dtd html 4.01 transitional//")) and
  482. systemId is not None):
  483. self.parser.compatMode = "limited quirks"
  484. self.parser.phase = self.parser.phases["beforeHtml"]
  def anythingElse(self):
      """Switch the parser to quirks mode and move on to the "beforeHtml" phase."""
      parser = self.parser
      parser.compatMode = "quirks"
      parser.phase = parser.phases["beforeHtml"]
  def processCharacters(self, token):
      """Handle character data received where a doctype was expected.

      Reports ``expected-doctype-but-got-chars``, runs the ``anythingElse``
      fallback and returns the token unchanged.
      """
      error_code = "expected-doctype-but-got-chars"
      self.parser.parseError(error_code)
      self.anythingElse()
      return token
  def processStartTag(self, token):
      """Handle a start tag received where a doctype was expected.

      Reports ``expected-doctype-but-got-start-tag`` with the tag name, runs
      the ``anythingElse`` fallback and returns the token unchanged.
      """
      details = {"name": token["name"]}
      self.parser.parseError("expected-doctype-but-got-start-tag", details)
      self.anythingElse()
      return token
  def processEndTag(self, token):
      """Handle an end tag received where a doctype was expected.

      Reports ``expected-doctype-but-got-end-tag`` with the tag name, runs
      the ``anythingElse`` fallback and returns the token unchanged.
      """
      details = {"name": token["name"]}
      self.parser.parseError("expected-doctype-but-got-end-tag", details)
      self.anythingElse()
      return token
  def processEOF(self):
      """Handle end of input received where a doctype was expected.

      Reports ``expected-doctype-but-got-eof``, runs the ``anythingElse``
      fallback and returns True.
      """
      error_code = "expected-doctype-but-got-eof"
      self.parser.parseError(error_code)
      self.anythingElse()
      return True
  class BeforeHtmlPhase(Phase):
      """Phase that creates the root ``html`` element and hands over to "beforeHead"."""
      __slots__ = ()

      # helper methods
      def insertHtmlElement(self):
          """Insert an implied ``html`` root and switch to the "beforeHead" phase."""
          root_token = impliedTagToken("html", "StartTag")
          self.tree.insertRoot(root_token)
          self.parser.phase = self.parser.phases["beforeHead"]

      # other
      def processEOF(self):
          """Create the implied root on end of input; returns True."""
          self.insertHtmlElement()
          return True

      def processComment(self, token):
          """Attach the comment directly to the document node."""
          document = self.tree.document
          self.tree.insertComment(token, document)

      def processSpaceCharacters(self, token):
          """Ignore whitespace in this phase."""
          return None

      def processCharacters(self, token):
          """Create the implied root, then return the token unchanged."""
          self.insertHtmlElement()
          return token

      def processStartTag(self, token):
          """Create the implied root for any start tag and return the token.

          An ``html`` start tag additionally sets ``parser.firstStartTag``.
          """
          is_html = token["name"] == "html"
          if is_html:
              self.parser.firstStartTag = True
          self.insertHtmlElement()
          return token

      def processEndTag(self, token):
          """Accept only ``head``/``body``/``html``/``br`` end tags.

          Those create the implied root and the token is returned; any other
          end tag is reported as a parse error and nothing is returned.
          """
          tag_name = token["name"]
          if tag_name in ("head", "body", "html", "br"):
              self.insertHtmlElement()
              return token
          self.parser.parseError("unexpected-end-tag-before-html",
                                 {"name": tag_name})
  class BeforeHeadPhase(Phase):
      """Phase that opens the ``head`` element, implying one when necessary."""
      __slots__ = ()

      def processEOF(self):
          """Imply a ``head`` start tag on end of input; returns True."""
          head_token = impliedTagToken("head", "StartTag")
          self.startTagHead(head_token)
          return True

      def processSpaceCharacters(self, token):
          """Ignore whitespace in this phase."""
          return None

      def processCharacters(self, token):
          """Imply a ``head`` start tag, then return the token unchanged."""
          head_token = impliedTagToken("head", "StartTag")
          self.startTagHead(head_token)
          return token

      def startTagHtml(self, token):
          """Delegate an ``html`` start tag to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagHead(self, token):
          """Insert the head element, record it as head pointer, enter "inHead"."""
          tree = self.tree
          tree.insertElement(token)
          # The element just inserted is the current node.
          tree.headPointer = tree.openElements[-1]
          self.parser.phase = self.parser.phases["inHead"]

      def startTagOther(self, token):
          """Imply a ``head`` start tag for any other start tag; return the token."""
          head_token = impliedTagToken("head", "StartTag")
          self.startTagHead(head_token)
          return token

      def endTagImplyHead(self, token):
          """Imply a ``head`` start tag for head/body/html/br end tags; return the token."""
          head_token = impliedTagToken("head", "StartTag")
          self.startTagHead(head_token)
          return token

      def endTagOther(self, token):
          """Report any other end tag as ``end-tag-after-implied-root``."""
          details = {"name": token["name"]}
          self.parser.parseError("end-tag-after-implied-root", details)

      # Tag-name dispatch tables; unlisted names fall through to ``default``.
      startTagHandler = _utils.MethodDispatcher([
          ("html", startTagHtml),
          ("head", startTagHead)
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          (("head", "body", "html", "br"), endTagImplyHead)
      ])
      endTagHandler.default = endTagOther
  class InHeadPhase(Phase):
      """Phase handling tokens while the ``head`` element is the current node."""
      __slots__ = ()

      # the real thing
      def processEOF(self):
          """Close the head via ``anythingElse``; returns True."""
          self.anythingElse()
          return True

      def processCharacters(self, token):
          """Close the head via ``anythingElse``; returns the token unchanged."""
          self.anythingElse()
          return token

      def startTagHtml(self, token):
          """Delegate an ``html`` start tag to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagHead(self, token):
          """Report a second ``head`` start tag as a parse error."""
          self.parser.parseError("two-heads-are-not-better-than-one")

      def startTagBaseLinkCommand(self, token):
          """Insert the element, pop it immediately and acknowledge self-closing."""
          tree = self.tree
          tree.insertElement(token)
          tree.openElements.pop()
          token["selfClosingAcknowledged"] = True

      def startTagMeta(self, token):
          """Insert a ``meta`` element and, if allowed, act on its encoding hint.

          The encoding is only changed while the stream's current encoding is
          still marked "tentative". A ``charset`` attribute takes precedence
          over an ``http-equiv="content-type"`` / ``content`` pair.
          """
          tree = self.tree
          tree.insertElement(token)
          tree.openElements.pop()
          token["selfClosingAcknowledged"] = True

          attributes = token["data"]
          stream = self.parser.tokenizer.stream
          if stream.charEncoding[1] != "tentative":
              return
          if "charset" in attributes:
              stream.changeEncoding(attributes["charset"])
              return
          if ("content" in attributes and
                  "http-equiv" in attributes and
                  attributes["http-equiv"].lower() == "content-type"):
              # Encoding it as UTF-8 here is a hack: ideally the abstract
              # Unicode string would be handed to ContentAttrParser directly,
              # but UTF-8 can encode every character and is an ASCII superset,
              # so it works.
              raw = attributes["content"].encode("utf-8")
              content_parser = _inputstream.ContentAttrParser(
                  _inputstream.EncodingBytes(raw))
              stream.changeEncoding(content_parser.parse())

      def startTagTitle(self, token):
          """Parse ``title`` content as RCDATA."""
          self.parser.parseRCDataRawtext(token, "RCDATA")

      def startTagNoFramesStyle(self, token):
          """Parse ``noframes``/``style`` content as RAWTEXT."""
          # Need to decide whether to implement the scripting-disabled case
          self.parser.parseRCDataRawtext(token, "RAWTEXT")

      def startTagNoscript(self, token):
          """Handle ``noscript``: RAWTEXT when scripting, else enter "inHeadNoscript"."""
          parser = self.parser
          if parser.scripting:
              parser.parseRCDataRawtext(token, "RAWTEXT")
              return
          self.tree.insertElement(token)
          parser.phase = parser.phases["inHeadNoscript"]

      def startTagScript(self, token):
          """Insert ``script``, switch the tokenizer to script data, enter "text"."""
          self.tree.insertElement(token)
          parser = self.parser
          parser.tokenizer.state = parser.tokenizer.scriptDataState
          # Remember the current phase before switching to the "text" phase.
          parser.originalPhase = parser.phase
          parser.phase = parser.phases["text"]

      def startTagOther(self, token):
          """Close the head for any other start tag; returns the token unchanged."""
          self.anythingElse()
          return token

      def endTagHead(self, token):
          """Pop the current node (asserted to be ``head``) and enter "afterHead"."""
          node = self.parser.tree.openElements.pop()
          assert node.name == "head", "Expected head got %s" % node.name
          self.parser.phase = self.parser.phases["afterHead"]

      def endTagHtmlBodyBr(self, token):
          """Close the head for html/body/br end tags; returns the token unchanged."""
          self.anythingElse()
          return token

      def endTagOther(self, token):
          """Report any other end tag as ``unexpected-end-tag``."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag", details)

      def anythingElse(self):
          """Act as if a ``head`` end tag had been seen."""
          implied_end = impliedTagToken("head")
          self.endTagHead(implied_end)

      # Tag-name dispatch tables; unlisted names fall through to ``default``.
      startTagHandler = _utils.MethodDispatcher([
          ("html", startTagHtml),
          ("title", startTagTitle),
          (("noframes", "style"), startTagNoFramesStyle),
          ("noscript", startTagNoscript),
          ("script", startTagScript),
          (("base", "basefont", "bgsound", "command", "link"),
           startTagBaseLinkCommand),
          ("meta", startTagMeta),
          ("head", startTagHead)
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          ("head", endTagHead),
          (("br", "html", "body"), endTagHtmlBodyBr)
      ])
      endTagHandler.default = endTagOther
  class InHeadNoscriptPhase(Phase):
      """Phase handling tokens inside a ``noscript`` element within ``head``."""
      __slots__ = ()

      def processEOF(self):
          """Report ``eof-in-head-noscript``, close the noscript; returns True."""
          self.parser.parseError("eof-in-head-noscript")
          self.anythingElse()
          return True

      def processComment(self, token):
          """Delegate comments to the "inHead" phase."""
          in_head = self.parser.phases["inHead"]
          return in_head.processComment(token)

      def processCharacters(self, token):
          """Report ``char-in-head-noscript``, close the noscript; returns the token."""
          self.parser.parseError("char-in-head-noscript")
          self.anythingElse()
          return token

      def processSpaceCharacters(self, token):
          """Delegate whitespace to the "inHead" phase."""
          in_head = self.parser.phases["inHead"]
          return in_head.processSpaceCharacters(token)

      def startTagHtml(self, token):
          """Delegate an ``html`` start tag to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagBaseLinkCommand(self, token):
          """Delegate head-level metadata start tags to the "inHead" phase."""
          in_head = self.parser.phases["inHead"]
          return in_head.processStartTag(token)

      def startTagHeadNoscript(self, token):
          """Report a nested ``head``/``noscript`` start tag as a parse error."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag", details)

      def startTagOther(self, token):
          """Report any other start tag, close the noscript; returns the token."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-inhead-noscript-tag", details)
          self.anythingElse()
          return token

      def endTagNoscript(self, token):
          """Pop the current node (asserted to be ``noscript``) and return to "inHead"."""
          node = self.parser.tree.openElements.pop()
          assert node.name == "noscript", "Expected noscript got %s" % node.name
          self.parser.phase = self.parser.phases["inHead"]

      def endTagBr(self, token):
          """Report a ``br`` end tag, close the noscript; returns the token."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-inhead-noscript-tag", details)
          self.anythingElse()
          return token

      def endTagOther(self, token):
          """Report any other end tag as ``unexpected-end-tag``."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag", details)

      def anythingElse(self):
          """Act as if a ``noscript`` end tag had been seen."""
          # Caller must raise parse error first!
          implied_end = impliedTagToken("noscript")
          self.endTagNoscript(implied_end)

      # Tag-name dispatch tables; unlisted names fall through to ``default``.
      startTagHandler = _utils.MethodDispatcher([
          ("html", startTagHtml),
          (("basefont", "bgsound", "link", "meta", "noframes", "style"),
           startTagBaseLinkCommand),
          (("head", "noscript"), startTagHeadNoscript),
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          ("noscript", endTagNoscript),
          ("br", endTagBr),
      ])
      endTagHandler.default = endTagOther
  class AfterHeadPhase(Phase):
      """Phase between the end of ``head`` and the start of ``body``/``frameset``."""
      __slots__ = ()

      def processEOF(self):
          """Imply a ``body`` via ``anythingElse``; returns True."""
          self.anythingElse()
          return True

      def processCharacters(self, token):
          """Imply a ``body`` via ``anythingElse``; returns the token unchanged."""
          self.anythingElse()
          return token

      def startTagHtml(self, token):
          """Delegate an ``html`` start tag to the "inBody" phase."""
          in_body = self.parser.phases["inBody"]
          return in_body.processStartTag(token)

      def startTagBody(self, token):
          """Insert an explicit ``body``, clear ``framesetOK`` and enter "inBody"."""
          parser = self.parser
          parser.framesetOK = False
          self.tree.insertElement(token)
          parser.phase = parser.phases["inBody"]

      def startTagFrameset(self, token):
          """Insert the ``frameset`` element and enter "inFrameset"."""
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inFrameset"]

      def startTagFromHead(self, token):
          """Process a head-only start tag that arrived after ``head`` was closed.

          Reports the error, temporarily pushes the head pointer back onto the
          stack of open elements, lets the "inHead" phase handle the token, and
          then removes the last ``head`` entry from the stack again.
          """
          self.parser.parseError("unexpected-start-tag-out-of-my-head",
                                 {"name": token["name"]})
          open_elements = self.tree.openElements
          open_elements.append(self.tree.headPointer)
          self.parser.phases["inHead"].processStartTag(token)
          # Search from the top of the stack for the head we pushed.
          head = next((candidate for candidate in self.tree.openElements[::-1]
                       if candidate.name == "head"), None)
          if head is not None:
              self.tree.openElements.remove(head)

      def startTagHead(self, token):
          """Report a late ``head`` start tag as a parse error."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag", details)

      def startTagOther(self, token):
          """Imply a ``body`` for any other start tag; returns the token unchanged."""
          self.anythingElse()
          return token

      def endTagHtmlBodyBr(self, token):
          """Imply a ``body`` for body/html/br end tags; returns the token unchanged."""
          self.anythingElse()
          return token

      def endTagOther(self, token):
          """Report any other end tag as ``unexpected-end-tag``."""
          details = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag", details)

      def anythingElse(self):
          """Insert an implied ``body``, enter "inBody" and set ``framesetOK`` to True."""
          body_token = impliedTagToken("body", "StartTag")
          self.tree.insertElement(body_token)
          parser = self.parser
          parser.phase = parser.phases["inBody"]
          parser.framesetOK = True

      # Tag-name dispatch tables; unlisted names fall through to ``default``.
      startTagHandler = _utils.MethodDispatcher([
          ("html", startTagHtml),
          ("body", startTagBody),
          ("frameset", startTagFrameset),
          (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
            "style", "title"),
           startTagFromHead),
          ("head", startTagHead)
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          (("body", "html", "br"), endTagHtmlBodyBr)
      ])
      endTagHandler.default = endTagOther
  753. class InBodyPhase(Phase):
  754. # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
  755. # the really-really-really-very crazy mode
  756. __slots__ = ("processSpaceCharacters",)
  def __init__(self, *args, **kwargs):
      """Initialise the phase and install the default whitespace handler.

      All arguments are forwarded to the base-class initialiser.
      """
      super(InBodyPhase, self).__init__(*args, **kwargs)
      # Whitespace handling is swappable per instance; start with the
      # ordinary (non-pre) handler.
      default_handler = self.processSpaceCharactersNonPre
      self.processSpaceCharacters = default_handler
  def isMatchingFormattingElement(self, node1, node2):
      """Return whether two nodes share name, namespace and attributes.

      The comparisons are made in that order and stop at the first mismatch.
      """
      if not (node1.name == node2.name):
          return False
      if not (node1.namespace == node2.namespace):
          return False
      return node1.attributes == node2.attributes
  765. # helper
  def addFormattingElement(self, token):
      """Insert a formatting element and record it as an active formatting element.

      Entries after the last ``Marker`` that match the new element (same
      name, namespace and attributes) are collected; when there are already
      three, the earliest of them in the list is removed before the new
      element is appended.
      """
      self.tree.insertElement(token)
      new_element = self.tree.openElements[-1]

      active = self.tree.activeFormattingElements
      duplicates = []
      # Walk backwards from the end of the list, stopping at the last marker.
      for entry in active[::-1]:
          if entry is Marker:
              break
          if self.isMatchingFormattingElement(entry, new_element):
              duplicates.append(entry)

      assert len(duplicates) <= 3
      if len(duplicates) == 3:
          # duplicates is in reverse list order, so [-1] is the earliest one.
          active.remove(duplicates[-1])
      active.append(new_element)
  779. # the real deal
  def processEOF(self):
      """Report a parse error if an element that needs a closing tag is still open.

      At most one ``expected-closing-tag-but-got-eof`` error is reported.
      Nothing is returned.
      """
      may_stay_open = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                                 "tfoot", "th", "thead", "tr", "body",
                                 "html"))
      unclosed = any(node.name not in may_stay_open
                     for node in self.tree.openElements[::-1])
      if unclosed:
          self.parser.parseError("expected-closing-tag-but-got-eof")
      # Stop parsing
  def processSpaceCharactersDropNewline(self, token):
      """Insert whitespace, dropping one leading newline at the start of a block.

      Sometimes (start of pre, listing and textarea blocks) a leading newline
      must be dropped. This handler is one-shot: it first restores the
      ordinary whitespace handler, then strips the newline only when the
      current node is one of those elements and has no content yet.
      """
      text = token["data"]
      self.processSpaceCharacters = self.processSpaceCharactersNonPre
      if text.startswith("\n"):
          current = self.tree.openElements[-1]
          if (current.name in ("pre", "listing", "textarea") and
                  not current.hasContent()):
              text = text[1:]
      if not text:
          return
      self.tree.reconstructActiveFormattingElements()
      self.tree.insertText(text)
  def processCharacters(self, token):
      """Insert character data into the tree and update ``framesetOK``.

      A token consisting solely of U+0000 is dropped. Otherwise the active
      formatting elements are reconstructed, the text is inserted, and
      ``parser.framesetOK`` is cleared as soon as any character outside
      ``spaceCharacters`` is found.
      """
      data = token["data"]
      if data == "\u0000":
          # The tokenizer should always emit null on its own
          return
      self.tree.reconstructActiveFormattingElements()
      self.tree.insertText(data)
      # A generator lets any() stop at the first non-space character instead
      # of building a full list for every character token.
      if (self.parser.framesetOK and
              any(char not in spaceCharacters for char in data)):
          self.parser.framesetOK = False
  def processSpaceCharactersNonPre(self, token):
      """Default whitespace handler: reconstruct formatting, then insert the text."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertText(token["data"])
  def startTagProcessInHead(self, token):
      """Delegate the start tag to the "inHead" phase and return its result."""
      in_head = self.parser.phases["inHead"]
      return in_head.processStartTag(token)
  def startTagBody(self, token):
      """Handle a ``body`` start tag seen while already in the body.

      Always a parse error. If the second open element is a ``body``, clears
      ``framesetOK`` and copies onto it every attribute from the token that
      it does not already have. Otherwise only asserts that the parser is in
      innerHTML mode.
      """
      self.parser.parseError("unexpected-start-tag", {"name": "body"})
      stack = self.tree.openElements
      if len(stack) == 1 or stack[1].name != "body":
          assert self.parser.innerHTML
          return
      self.parser.framesetOK = False
      for attr_name, attr_value in token["data"].items():
          existing = self.tree.openElements[1].attributes
          if attr_name not in existing:
              existing[attr_name] = attr_value
  def startTagFrameset(self, token):
      """Handle a ``frameset`` start tag seen while in the body.

      Always a parse error. When the second open element is a ``body`` and
      ``framesetOK`` is still set, that body is detached from its parent,
      the stack is popped back to ``html``, the frameset is inserted and
      the parser enters "inFrameset". Otherwise the token is ignored.
      """
      self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
      stack = self.tree.openElements
      if len(stack) == 1 or stack[1].name != "body":
          assert self.parser.innerHTML
          return
      if not self.parser.framesetOK:
          return
      body = self.tree.openElements[1]
      if body.parent:
          body.parent.removeChild(body)
      while self.tree.openElements[-1].name != "html":
          self.tree.openElements.pop()
      self.tree.insertElement(token)
      self.parser.phase = self.parser.phases["inFrameset"]
  def startTagCloseP(self, token):
      """Close any ``p`` in button scope, then insert the element."""
      tree = self.tree
      p_is_open = tree.elementInScope("p", variant="button")
      if p_is_open:
          self.endTagP(impliedTagToken("p"))
      tree.insertElement(token)
  def startTagPreListing(self, token):
      """Open a ``pre``/``listing`` element.

      Closes any ``p`` in button scope, inserts the element, clears
      ``framesetOK`` and installs the newline-dropping whitespace handler.
      """
      tree = self.tree
      p_is_open = tree.elementInScope("p", variant="button")
      if p_is_open:
          self.endTagP(impliedTagToken("p"))
      tree.insertElement(token)
      self.parser.framesetOK = False
      self.processSpaceCharacters = self.processSpaceCharactersDropNewline
  def startTagForm(self, token):
      """Open a ``form`` unless a form pointer is already set.

      With a form pointer set, the tag is reported as a parse error and
      ignored. Otherwise any ``p`` in button scope is closed, the form is
      inserted and becomes the form pointer.
      """
      tree = self.tree
      if tree.formPointer:
          self.parser.parseError("unexpected-start-tag", {"name": "form"})
          return
      if tree.elementInScope("p", variant="button"):
          self.endTagP(impliedTagToken("p"))
      tree.insertElement(token)
      tree.formPointer = tree.openElements[-1]
  def startTagListItem(self, token):
      """Open an ``li``/``dt``/``dd``, first closing an open sibling item.

      The stack of open elements is scanned from the top: the first element
      whose name is a stop name for this tag gets an implied end tag; the
      scan also stops at any special element other than address, div or p.
      Afterwards any ``p`` in button scope is closed and the new element is
      inserted. ``framesetOK`` is cleared.
      """
      self.parser.framesetOK = False

      stop_names_by_tag = {"li": ["li"],
                           "dt": ["dt", "dd"],
                           "dd": ["dt", "dd"]}
      stop_names = stop_names_by_tag[token["name"]]
      for candidate in reversed(self.tree.openElements):
          if candidate.name in stop_names:
              implied_end = impliedTagToken(candidate.name, "EndTag")
              self.parser.phase.processEndTag(implied_end)
              break
          is_barrier = (candidate.nameTuple in specialElements and
                        candidate.name not in ("address", "div", "p"))
          if is_barrier:
              break

      if self.tree.elementInScope("p", variant="button"):
          implied_p_end = impliedTagToken("p", "EndTag")
          self.parser.phase.processEndTag(implied_p_end)

      self.tree.insertElement(token)
  def startTagPlaintext(self, token):
      """Open ``plaintext``: close any ``p`` in button scope, insert the
      element and switch the tokenizer to its plaintext state."""
      tree = self.tree
      p_is_open = tree.elementInScope("p", variant="button")
      if p_is_open:
          self.endTagP(impliedTagToken("p"))
      tree.insertElement(token)
      tokenizer = self.parser.tokenizer
      tokenizer.state = tokenizer.plaintextState
  def startTagHeading(self, token):
      """Open a heading element.

      Closes any ``p`` in button scope. If the current node is itself a
      heading, that is a parse error and the node is popped before the new
      heading is inserted.
      """
      tree = self.tree
      if tree.elementInScope("p", variant="button"):
          self.endTagP(impliedTagToken("p"))
      nested_in_heading = tree.openElements[-1].name in headingElements
      if nested_in_heading:
          self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
          tree.openElements.pop()
      tree.insertElement(token)
  def startTagA(self, token):
      """Open an ``a`` element, first closing any still-active ``a``.

      If an ``a`` is found among the active formatting elements, this is a
      parse error: an implied ``a`` end tag is processed and the old element
      is then removed from both the stack of open elements and the active
      formatting list if it is still present. Finally the formatting
      elements are reconstructed and the new element is added.
      """
      tree = self.tree
      previous_a = tree.elementInActiveFormattingElements("a")
      if previous_a:
          self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                 {"startName": "a", "endName": "a"})
          self.endTagFormatting(impliedTagToken("a"))
          # The implied end tag may not have removed the old element.
          if previous_a in self.tree.openElements:
              self.tree.openElements.remove(previous_a)
          if previous_a in self.tree.activeFormattingElements:
              self.tree.activeFormattingElements.remove(previous_a)
      self.tree.reconstructActiveFormattingElements()
      self.addFormattingElement(token)
  def startTagFormatting(self, token):
      """Reconstruct the active formatting elements, then add this one."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      self.addFormattingElement(token)
  def startTagNobr(self, token):
      """Open a ``nobr`` element, closing one that is already in scope.

      A ``nobr`` in scope is a parse error: an implied end tag is processed
      and the formatting elements are reconstructed a second time before the
      new element is added.
      """
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      already_open = tree.elementInScope("nobr")
      if already_open:
          self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                 {"startName": "nobr", "endName": "nobr"})
          self.processEndTag(impliedTagToken("nobr"))
          # XXX Need tests that trigger the following
          self.tree.reconstructActiveFormattingElements()
      self.addFormattingElement(token)
  def startTagButton(self, token):
      """Open a ``button`` element.

      If a button is already in scope, this is a parse error: an implied
      ``button`` end tag is processed and the token is returned. Otherwise
      the element is inserted, ``framesetOK`` is cleared and nothing is
      returned.
      """
      if not self.tree.elementInScope("button"):
          self.tree.reconstructActiveFormattingElements()
          self.tree.insertElement(token)
          self.parser.framesetOK = False
          return None
      self.parser.parseError("unexpected-start-tag-implies-end-tag",
                             {"startName": "button", "endName": "button"})
      self.processEndTag(impliedTagToken("button"))
      return token
  def startTagAppletMarqueeObject(self, token):
      """Insert the element, push a ``Marker`` onto the active formatting
      list and clear ``framesetOK``."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertElement(token)
      tree.activeFormattingElements.append(Marker)
      self.parser.framesetOK = False
  def startTagXmp(self, token):
      """Open ``xmp``: close any ``p`` in button scope, reconstruct formatting,
      clear ``framesetOK`` and parse the content as RAWTEXT."""
      tree = self.tree
      p_is_open = tree.elementInScope("p", variant="button")
      if p_is_open:
          self.endTagP(impliedTagToken("p"))
      tree.reconstructActiveFormattingElements()
      parser = self.parser
      parser.framesetOK = False
      parser.parseRCDataRawtext(token, "RAWTEXT")
  def startTagTable(self, token):
      """Open a ``table`` and enter the "inTable" phase.

      Outside quirks mode, a ``p`` in button scope is closed first.
      ``framesetOK`` is cleared.
      """
      parser = self.parser
      if (parser.compatMode != "quirks" and
              self.tree.elementInScope("p", variant="button")):
          self.processEndTag(impliedTagToken("p"))
      self.tree.insertElement(token)
      self.parser.framesetOK = False
      self.parser.phase = self.parser.phases["inTable"]
  def startTagVoidFormatting(self, token):
      """Insert a void element: reconstruct formatting, insert and immediately
      pop the element, acknowledge self-closing and clear ``framesetOK``."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertElement(token)
      tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
      self.parser.framesetOK = False
  def startTagInput(self, token):
      """Insert an ``input`` element as a void formatting element.

      ``framesetOK`` is restored to its previous value when the ``type``
      attribute, ASCII-lowercased, is "hidden".
      """
      saved_frameset_ok = self.parser.framesetOK
      self.startTagVoidFormatting(token)
      attrs = token["data"]
      if ("type" in attrs and
              attrs["type"].translate(asciiUpper2Lower) == "hidden"):
          # input type=hidden doesn't change framesetOK
          self.parser.framesetOK = saved_frameset_ok
  def startTagParamSource(self, token):
      """Insert the element, pop it immediately and acknowledge self-closing."""
      tree = self.tree
      tree.insertElement(token)
      tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
  def startTagHr(self, token):
      """Insert an ``hr``: close any ``p`` in button scope, insert and pop the
      element, acknowledge self-closing and clear ``framesetOK``."""
      tree = self.tree
      p_is_open = tree.elementInScope("p", variant="button")
      if p_is_open:
          self.endTagP(impliedTagToken("p"))
      tree.insertElement(token)
      tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
      self.parser.framesetOK = False
  def startTagImage(self, token):
      """Report ``image`` as a parse error and reprocess it as ``img``.

      The implied ``img`` start tag carries over the original attributes and
      self-closing flag.
      """
      # No really...
      self.parser.parseError("unexpected-start-tag-treated-as",
                             {"originalName": "image", "newName": "img"})
      img_token = impliedTagToken("img", "StartTag",
                                  attributes=token["data"],
                                  selfClosing=token["selfClosing"])
      self.processStartTag(img_token)
  def startTagIsIndex(self, token):
      """Expand the deprecated ``isindex`` tag into form/hr/label/input markup.

      Always reports ``deprecated-tag``. If a form pointer is already set the
      token is otherwise ignored. The generated ``form`` receives the
      ``action`` attribute, the label text is the ``prompt`` attribute or a
      default English sentence, and the generated ``input`` receives every
      remaining attribute plus ``name="isindex"``.
      """
      self.parser.parseError("deprecated-tag", {"name": "isindex"})
      if self.tree.formPointer:
          return

      source_attrs = token["data"]
      form_attrs = {}
      if "action" in source_attrs:
          form_attrs["action"] = source_attrs["action"]
      self.processStartTag(impliedTagToken("form", "StartTag",
                                           attributes=form_attrs))
      self.processStartTag(impliedTagToken("hr", "StartTag"))
      self.processStartTag(impliedTagToken("label", "StartTag"))

      # XXX Localization ...
      prompt = (token["data"]["prompt"] if "prompt" in token["data"]
                else "This is a searchable index. Enter search keywords: ")
      self.processCharacters(
          {"type": tokenTypes["Characters"], "data": prompt})

      # The input gets a copy of the attributes minus the ones consumed above.
      attributes = token["data"].copy()
      for consumed in ("action", "prompt"):
          if consumed in attributes:
              del attributes[consumed]
      attributes["name"] = "isindex"
      self.processStartTag(impliedTagToken("input", "StartTag",
                                           attributes=attributes,
                                           selfClosing=token["selfClosing"]))
      self.processEndTag(impliedTagToken("label"))
      self.processStartTag(impliedTagToken("hr", "StartTag"))
      self.processEndTag(impliedTagToken("form"))
  def startTagTextarea(self, token):
      """Open ``textarea``: insert it, switch the tokenizer to RCDATA, install
      the newline-dropping whitespace handler and clear ``framesetOK``."""
      self.tree.insertElement(token)
      parser = self.parser
      parser.tokenizer.state = parser.tokenizer.rcdataState
      self.processSpaceCharacters = self.processSpaceCharactersDropNewline
      parser.framesetOK = False
  def startTagIFrame(self, token):
      """Clear ``framesetOK`` and handle ``iframe`` as a raw-text element."""
      parser = self.parser
      parser.framesetOK = False
      self.startTagRawtext(token)
  def startTagNoscript(self, token):
      """Handle ``noscript`` as raw text when scripting is enabled, otherwise
      as an ordinary element."""
      handler = (self.startTagRawtext if self.parser.scripting
                 else self.startTagOther)
      handler(token)
  def startTagRawtext(self, token):
      """Parse the element's content as RAWTEXT.

      Used for iframe, noembed, noframes, and noscript (if scripting is
      enabled).
      """
      parser = self.parser
      parser.parseRCDataRawtext(token, "RAWTEXT")
  def startTagOpt(self, token):
      """Open an ``option``/``optgroup``, first closing an ``option`` that is
      the current node."""
      current = self.tree.openElements[-1]
      if current.name == "option":
          implied_end = impliedTagToken("option")
          self.parser.phase.processEndTag(implied_end)
      self.tree.reconstructActiveFormattingElements()
      self.parser.tree.insertElement(token)
  def startTagSelect(self, token):
      """Open a ``select`` and choose the matching select phase.

      The parser enters "inSelectInTable" when its current phase is one of
      the table-related phases, otherwise "inSelect". ``framesetOK`` is
      cleared.
      """
      self.tree.reconstructActiveFormattingElements()
      self.tree.insertElement(token)
      parser = self.parser
      parser.framesetOK = False
      phases = parser.phases
      table_phases = tuple(phases[key] for key in (
          "inTable", "inCaption", "inColumnGroup",
          "inTableBody", "inRow", "inCell"))
      next_phase = ("inSelectInTable" if parser.phase in table_phases
                    else "inSelect")
      parser.phase = parser.phases[next_phase]
  def startTagRpRt(self, token):
      """Open an ``rp``/``rt`` element.

      With a ``ruby`` in scope, implied end tags are generated first; a
      current node other than ``ruby`` afterwards is a parse error.
      """
      tree = self.tree
      if tree.elementInScope("ruby"):
          tree.generateImpliedEndTags()
          current = tree.openElements[-1]
          if current.name != "ruby":
              self.parser.parseError()
      tree.insertElement(token)
  def startTagMath(self, token):
      """Insert a ``math`` element in the MathML namespace.

      Attributes are adjusted for MathML and for foreign content first. A
      self-closing tag is popped again immediately and acknowledged.
      """
      self.tree.reconstructActiveFormattingElements()
      parser = self.parser
      parser.adjustMathMLAttributes(token)
      parser.adjustForeignAttributes(token)
      token["namespace"] = namespaces["mathml"]
      self.tree.insertElement(token)
      # Need to get the parse error right for the case where the token
      # has a namespace not equal to the xmlns attribute
      if not token["selfClosing"]:
          return
      self.tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
  def startTagSvg(self, token):
      """Insert an ``svg`` element in the SVG namespace.

      Attributes are adjusted for SVG and for foreign content first. A
      self-closing tag is popped again immediately and acknowledged.
      """
      self.tree.reconstructActiveFormattingElements()
      parser = self.parser
      parser.adjustSVGAttributes(token)
      parser.adjustForeignAttributes(token)
      token["namespace"] = namespaces["svg"]
      self.tree.insertElement(token)
      # Need to get the parse error right for the case where the token
      # has a namespace not equal to the xmlns attribute
      if not token["selfClosing"]:
          return
      self.tree.openElements.pop()
      token["selfClosingAcknowledged"] = True
  def startTagMisplaced(self, token):
      """Ignore a start tag that belongs to a different insertion mode.

      Covers elements that should be children of other elements handled
      elsewhere: "caption", "col", "colgroup", "frame", "frameset", "head",
      "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", "tr",
      "noscript". The tag is reported as a parse error and dropped.
      """
      details = {"name": token["name"]}
      self.parser.parseError("unexpected-start-tag-ignored", details)
  def startTagOther(self, token):
      """Default start-tag handling: reconstruct formatting, then insert the element."""
      tree = self.tree
      tree.reconstructActiveFormattingElements()
      tree.insertElement(token)
  def endTagP(self, token):
      """Close the nearest ``p`` element, creating one first if none is open.

      With a ``p`` in button scope: implied end tags (except ``p``) are
      generated, a current node other than ``p`` is a parse error, and the
      stack is popped up to and including the ``p``. Without one: an implied
      ``p`` start tag is processed, the error is reported, and this handler
      runs again with an implied ``p`` end tag.
      """
      if self.tree.elementInScope("p", variant="button"):
          self.tree.generateImpliedEndTags("p")
          if self.tree.openElements[-1].name != "p":
              self.parser.parseError("unexpected-end-tag", {"name": "p"})
          # Pop up to and including the p element.
          while self.tree.openElements.pop().name != "p":
              pass
          return
      self.startTagCloseP(impliedTagToken("p", "StartTag"))
      self.parser.parseError("unexpected-end-tag", {"name": "p"})
      self.endTagP(impliedTagToken("p", "EndTag"))
  def endTagBody(self, token):
      """Handle a ``body`` end tag and enter the "afterBody" phase.

      Without a ``body`` in scope the token is a parse error and is ignored.
      Otherwise, when the current node is not ``body``, the open elements
      from the third stack entry onwards are checked and at most one
      ``expected-one-end-tag-but-got-another`` error is reported for the
      first element that is not allowed to remain open.
      """
      if not self.tree.elementInScope("body"):
          self.parser.parseError()
          return
      if self.tree.openElements[-1].name != "body":
          # Built once, outside the loop, rather than on every iteration.
          may_stay_open = frozenset(("dd", "dt", "li", "optgroup",
                                     "option", "p", "rp", "rt",
                                     "tbody", "td", "tfoot",
                                     "th", "thead", "tr", "body",
                                     "html"))
          for node in self.tree.openElements[2:]:
              if node.name not in may_stay_open:
                  # Not sure this is the correct name for the parse error
                  self.parser.parseError(
                      "expected-one-end-tag-but-got-another",
                      {"gotName": "body", "expectedName": node.name})
                  break
      self.parser.phase = self.parser.phases["afterBody"]
  def endTagHtml(self, token):
      """Treat an ``html`` end tag as an implied ``body`` end tag.

      Returns the token when a ``body`` is in scope; otherwise does nothing
      and returns None.
      """
      # We repeat the test for the body end tag token being ignored here
      if not self.tree.elementInScope("body"):
          return None
      self.endTagBody(impliedTagToken("body"))
      return token
  def endTagBlock(self, token):
      """Close a block-level element.

      When the element is in scope, implied end tags are generated and the
      stack is popped up to and including it. A current node with a
      different name is reported as ``end-tag-too-early`` whether or not the
      element is in scope.
      """
      tag_name = token["name"]
      # Put us back in the right whitespace handling mode
      if tag_name == "pre":
          self.processSpaceCharacters = self.processSpaceCharactersNonPre
      in_scope = self.tree.elementInScope(tag_name)
      if in_scope:
          self.tree.generateImpliedEndTags()
      if self.tree.openElements[-1].name != tag_name:
          self.parser.parseError("end-tag-too-early", {"name": tag_name})
      if in_scope:
          while self.tree.openElements.pop().name != tag_name:
              pass
  def endTagForm(self, token):
      """Close the form referenced by the form pointer.

      The form pointer is always cleared. With no form, or a form that is
      not in scope, the token is a parse error. Otherwise implied end tags
      are generated, a current node other than the form is reported, and the
      form is removed from the stack of open elements.
      """
      form = self.tree.formPointer
      self.tree.formPointer = None
      if form is None or not self.tree.elementInScope(form):
          self.parser.parseError("unexpected-end-tag",
                                 {"name": "form"})
          return
      self.tree.generateImpliedEndTags()
      if self.tree.openElements[-1] != form:
          self.parser.parseError("end-tag-too-early-ignored",
                                 {"name": "form"})
      self.tree.openElements.remove(form)
  def endTagListItem(self, token):
      """Close an ``li``/``dd``/``dt`` element.

      ``li`` is looked up with the "list" scope variant, other names with
      the default scope. Not in scope is a parse error. Otherwise implied
      end tags (except this tag) are generated, a mismatching current node
      is reported, and the stack is popped up to and including the element.
      """
      tag_name = token["name"]
      variant = "list" if tag_name == "li" else None
      if not self.tree.elementInScope(tag_name, variant=variant):
          self.parser.parseError("unexpected-end-tag", {"name": tag_name})
          return
      self.tree.generateImpliedEndTags(exclude=tag_name)
      if self.tree.openElements[-1].name != tag_name:
          self.parser.parseError(
              "end-tag-too-early",
              {"name": tag_name})
      while self.tree.openElements.pop().name != tag_name:
          pass
  def endTagHeading(self, token):
      """Close the nearest open heading element.

      If any heading is in scope, implied end tags are generated. A current
      node whose name differs from the token's is reported as
      ``end-tag-too-early``. If a heading is (still) in scope, the stack is
      popped until a heading element has been removed.
      """
      if any(self.tree.elementInScope(heading) for heading in headingElements):
          self.tree.generateImpliedEndTags()
      if self.tree.openElements[-1].name != token["name"]:
          self.parser.parseError("end-tag-too-early", {"name": token["name"]})

      if any(self.tree.elementInScope(heading) for heading in headingElements):
          popped = self.tree.openElements.pop()
          while popped.name not in headingElements:
              popped = self.tree.openElements.pop()
  1162. def endTagFormatting(self, token):
  1163. """The much-feared adoption agency algorithm"""
  1164. # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
  1165. # XXX Better parseError messages appreciated.
  1166. # Step 1
  1167. outerLoopCounter = 0
  1168. # Step 2
  1169. while outerLoopCounter < 8:
  1170. # Step 3
  1171. outerLoopCounter += 1
  1172. # Step 4:
  1173. # Let the formatting element be the last element in
  1174. # the list of active formatting elements that:
  1175. # - is between the end of the list and the last scope
  1176. # marker in the list, if any, or the start of the list
  1177. # otherwise, and
  1178. # - has the same tag name as the token.
  1179. formattingElement = self.tree.elementInActiveFormattingElements(
  1180. token["name"])
  1181. if (not formattingElement or
  1182. (formattingElement in self.tree.openElements and
  1183. not self.tree.elementInScope(formattingElement.name))):
  1184. # If there is no such node, then abort these steps
  1185. # and instead act as described in the "any other
  1186. # end tag" entry below.
  1187. self.endTagOther(token)
  1188. return
  1189. # Otherwise, if there is such a node, but that node is
  1190. # not in the stack of open elements, then this is a
  1191. # parse error; remove the element from the list, and
  1192. # abort these steps.
  1193. elif formattingElement not in self.tree.openElements:
  1194. self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
  1195. self.tree.activeFormattingElements.remove(formattingElement)
  1196. return
  1197. # Otherwise, if there is such a node, and that node is
  1198. # also in the stack of open elements, but the element
  1199. # is not in scope, then this is a parse error; ignore
  1200. # the token, and abort these steps.
  1201. elif not self.tree.elementInScope(formattingElement.name):
  1202. self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
  1203. return
  1204. # Otherwise, there is a formatting element and that
  1205. # element is in the stack and is in scope. If the
  1206. # element is not the current node, this is a parse
  1207. # error. In any case, proceed with the algorithm as
  1208. # written in the following steps.
  1209. else:
  1210. if formattingElement != self.tree.openElements[-1]:
  1211. self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
  1212. # Step 5:
  1213. # Let the furthest block be the topmost node in the
  1214. # stack of open elements that is lower in the stack
  1215. # than the formatting element, and is an element in
  1216. # the special category. There might not be one.
  1217. afeIndex = self.tree.openElements.index(formattingElement)
  1218. furthestBlock = None
  1219. for element in self.tree.openElements[afeIndex:]:
  1220. if element.nameTuple in specialElements:
  1221. furthestBlock = element
  1222. break
  1223. # Step 6:
  1224. # If there is no furthest block, then the UA must
  1225. # first pop all the nodes from the bottom of the stack
  1226. # of open elements, from the current node up to and
  1227. # including the formatting element, then remove the
  1228. # formatting element from the list of active
  1229. # formatting elements, and finally abort these steps.
  1230. if furthestBlock is None:
  1231. element = self.tree.openElements.pop()
  1232. while element != formattingElement:
  1233. element = self.tree.openElements.pop()
  1234. self.tree.activeFormattingElements.remove(element)
  1235. return
  1236. # Step 7
  1237. commonAncestor = self.tree.openElements[afeIndex - 1]
  1238. # Step 8:
  1239. # The bookmark is supposed to help us identify where to reinsert
  1240. # nodes in step 15. We have to ensure that we reinsert nodes after
  1241. # the node before the active formatting element. Note the bookmark
  1242. # can move in step 9.7
  1243. bookmark = self.tree.activeFormattingElements.index(formattingElement)
  1244. # Step 9
  1245. lastNode = node = furthestBlock
  1246. innerLoopCounter = 0
  1247. index = self.tree.openElements.index(node)
  1248. while innerLoopCounter < 3:
  1249. innerLoopCounter += 1
  1250. # Node is element before node in open elements
  1251. index -= 1
  1252. node = self.tree.openElements[index]
  1253. if node not in self.tree.activeFormattingElements:
  1254. self.tree.openElements.remove(node)
  1255. continue
  1256. # Step 9.6
  1257. if node == formattingElement:
  1258. break
  1259. # Step 9.7
  1260. if lastNode == furthestBlock:
  1261. bookmark = self.tree.activeFormattingElements.index(node) + 1
  1262. # Step 9.8
  1263. clone = node.cloneNode()
  1264. # Replace node with clone
  1265. self.tree.activeFormattingElements[
  1266. self.tree.activeFormattingElements.index(node)] = clone
  1267. self.tree.openElements[
  1268. self.tree.openElements.index(node)] = clone
  1269. node = clone
  1270. # Step 9.9
  1271. # Remove lastNode from its parents, if any
  1272. if lastNode.parent:
  1273. lastNode.parent.removeChild(lastNode)
  1274. node.appendChild(lastNode)
  1275. # Step 9.10
  1276. lastNode = node
  1277. # Step 10
  1278. # Foster parent lastNode if commonAncestor is a
  1279. # table, tbody, tfoot, thead, or tr we need to foster
  1280. # parent the lastNode
  1281. if lastNode.parent:
  1282. lastNode.parent.removeChild(lastNode)
  1283. if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
  1284. parent, insertBefore = self.tree.getTableMisnestedNodePosition()
  1285. parent.insertBefore(lastNode, insertBefore)
  1286. else:
  1287. commonAncestor.appendChild(lastNode)
  1288. # Step 11
  1289. clone = formattingElement.cloneNode()
  1290. # Step 12
  1291. furthestBlock.reparentChildren(clone)
  1292. # Step 13
  1293. furthestBlock.appendChild(clone)
  1294. # Step 14
  1295. self.tree.activeFormattingElements.remove(formattingElement)
  1296. self.tree.activeFormattingElements.insert(bookmark, clone)
  1297. # Step 15
  1298. self.tree.openElements.remove(formattingElement)
  1299. self.tree.openElements.insert(
  1300. self.tree.openElements.index(furthestBlock) + 1, clone)
  def endTagAppletMarqueeObject(self, token):
      """Handle an ``applet``, ``marquee`` or ``object`` end tag.

      When an element with the token's name is in scope, implied end tags
      are generated first.  A parse error is reported whenever the current
      node does not carry the token's name.  If the element is in scope the
      stack of open elements is popped up to and including that element and
      ``clearActiveFormattingElements`` is called on the tree; otherwise
      nothing further happens.

      :arg token: the end tag token; only ``token["name"]`` is read
      """
      tagName = token["name"]
      tree = self.tree

      if tree.elementInScope(tagName):
          tree.generateImpliedEndTags()
      if tree.openElements[-1].name != tagName:
          self.parser.parseError("end-tag-too-early", {"name": tagName})

      # Scope is queried again after the implied end tags were generated;
      # with no matching element in scope the stack is left untouched.
      if not tree.elementInScope(tagName):
          return

      # Pop until the element that was just removed is the named one.
      while tree.openElements.pop().name != tagName:
          pass
      tree.clearActiveFormattingElements()
  def endTagBr(self, token):
      """Handle a ``br`` end tag by inserting a ``br`` element instead.

      A parse error is reported, the active formatting elements are
      reconstructed, an implied ``br`` start tag is inserted and the new
      element is immediately popped from the stack of open elements.

      :arg token: the end tag token (not inspected)
      """
      errorData = {"originalName": "br", "newName": "br element"}
      self.parser.parseError("unexpected-end-tag-treated-as", errorData)

      tree = self.tree
      tree.reconstructActiveFormattingElements()
      impliedBr = impliedTagToken("br", "StartTag")
      tree.insertElement(impliedBr)
      # The element just inserted is not left on the stack.
      tree.openElements.pop()
  def endTagOther(self, token):
      """Handle an end tag that has no dedicated handler in this phase.

      The stack of open elements is walked from the current node upwards:

      * at the first node whose name equals the token's name, implied end
        tags are generated (excluding that name), a parse error is
        reported if the current node is then not of that name, and the
        stack is popped up to and including the matched node;
      * if a node in ``specialElements`` is met first, a parse error is
        reported and the walk stops without changing the stack.

      :arg token: the end tag token; only ``token["name"]`` is read
      """
      tagName = token["name"]
      tree = self.tree

      # Walk a reversed copy so that popping below cannot disturb the walk.
      for node in tree.openElements[::-1]:
          if node.name != tagName:
              if node.nameTuple in specialElements:
                  self.parser.parseError("unexpected-end-tag", {"name": tagName})
                  break
              continue

          tree.generateImpliedEndTags(exclude=tagName)
          if tree.openElements[-1].name != tagName:
              self.parser.parseError("unexpected-end-tag", {"name": tagName})
          # Discard everything above the matched node, then the node itself.
          while tree.openElements.pop() != node:
              pass
          break
  # Start tag dispatch table for this phase.  Each entry pairs a tag name
  # (or a group of tag names) with the method registered for it; entries
  # are kept in their original order.
  startTagHandler = _utils.MethodDispatcher([
      ("html", Phase.startTagHtml),
      # Tags passed on to startTagProcessInHead
      (
          (
              "base", "basefont", "bgsound", "command", "link", "meta",
              "script", "style", "title",
          ),
          startTagProcessInHead,
      ),
      ("body", startTagBody),
      ("frameset", startTagFrameset),
      # Block-level containers
      (
          (
              "address", "article", "aside", "blockquote", "center",
              "details", "dir", "div", "dl", "fieldset", "figcaption",
              "figure", "footer", "header", "hgroup", "main", "menu",
              "nav", "ol", "p", "section", "summary", "ul",
          ),
          startTagCloseP,
      ),
      (headingElements, startTagHeading),
      (("pre", "listing"), startTagPreListing),
      ("form", startTagForm),
      (("li", "dd", "dt"), startTagListItem),
      ("plaintext", startTagPlaintext),
      # Anchors and formatting elements
      ("a", startTagA),
      (
          (
              "b", "big", "code", "em", "font", "i", "s", "small",
              "strike", "strong", "tt", "u",
          ),
          startTagFormatting,
      ),
      ("nobr", startTagNobr),
      ("button", startTagButton),
      (("applet", "marquee", "object"), startTagAppletMarqueeObject),
      ("xmp", startTagXmp),
      ("table", startTagTable),
      (
          ("area", "br", "embed", "img", "keygen", "wbr"),
          startTagVoidFormatting,
      ),
      (("param", "source", "track"), startTagParamSource),
      ("input", startTagInput),
      ("hr", startTagHr),
      ("image", startTagImage),
      ("isindex", startTagIsIndex),
      ("textarea", startTagTextarea),
      ("iframe", startTagIFrame),
      ("noscript", startTagNoscript),
      (("noembed", "noframes"), startTagRawtext),
      ("select", startTagSelect),
      (("rp", "rt"), startTagRpRt),
      (("option", "optgroup"), startTagOpt),
      # Single names: a plain string key, exactly as a parenthesised
      # string without a trailing comma would be.
      ("math", startTagMath),
      ("svg", startTagSvg),
      # Table and document structure tags registered with startTagMisplaced
      (
          (
              "caption", "col", "colgroup", "frame", "head",
              "tbody", "td", "tfoot", "th", "thead", "tr",
          ),
          startTagMisplaced,
      ),
  ])
  # startTagOther is registered as the dispatcher's default.
  startTagHandler.default = startTagOther
  # End tag dispatch table for this phase, in the original entry order.
  endTagHandler = _utils.MethodDispatcher([
      ("body", endTagBody),
      ("html", endTagHtml),
      # Block-level containers
      (
          (
              "address", "article", "aside", "blockquote", "button",
              "center", "details", "dialog", "dir", "div", "dl",
              "fieldset", "figcaption", "figure", "footer", "header",
              "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
              "section", "summary", "ul",
          ),
          endTagBlock,
      ),
      ("form", endTagForm),
      ("p", endTagP),
      (("dd", "dt", "li"), endTagListItem),
      (headingElements, endTagHeading),
      # Anchors and formatting elements
      (
          (
              "a", "b", "big", "code", "em", "font", "i", "nobr", "s",
              "small", "strike", "strong", "tt", "u",
          ),
          endTagFormatting,
      ),
      (("applet", "marquee", "object"), endTagAppletMarqueeObject),
      ("br", endTagBr),
  ])
  # endTagOther is registered as the dispatcher's default.
  endTagHandler.default = endTagOther
  class TextPhase(Phase):
      """Phase used while the text content of an element is collected.

      Character data is inserted as text.  Every end tag pops the current
      node and switches the parser back to ``parser.originalPhase``.  Start
      tags are not expected here (the assertion message in startTagOther
      names this the RCDATA/RAWTEXT mode).
      """
      __slots__ = ()

      def processCharacters(self, token):
          """Insert the token's character data into the tree as text."""
          self.tree.insertText(token["data"])

      def processEOF(self):
          """Report the unclosed current node, pop it and restore the phase.

          :returns: ``True``
          """
          stack = self.tree.openElements
          errorData = {"name": stack[-1].name}
          self.parser.parseError("expected-named-closing-tag-but-got-eof",
                                 errorData)
          stack.pop()
          self.parser.phase = self.parser.originalPhase
          return True

      def startTagOther(self, token):
          """Fail: no start tag should be dispatched to this phase."""
          assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token["name"]

      def endTagScript(self, token):
          """Pop the current node, which must be ``script``, and restore the phase."""
          closed = self.tree.openElements.pop()
          assert closed.name == "script"
          self.parser.phase = self.parser.originalPhase
          # The rest of this method is all stuff that only happens if
          # document.write works

      def endTagOther(self, token):
          """Pop the current node and restore the original phase."""
          self.tree.openElements.pop()
          self.parser.phase = self.parser.originalPhase

      # No start tag has a dedicated handler; all go to startTagOther.
      startTagHandler = _utils.MethodDispatcher([])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          ("script", endTagScript),
      ])
      endTagHandler.default = endTagOther
  class InTablePhase(Phase):
      """Phase for tokens that appear directly inside a ``table`` element.

      Table structure tags (caption, colgroup, col, row groups, rows and
      cells) are handled here or handed to the matching table phase.
      Anything else is processed by the inBody phase while
      ``tree.insertFromTable`` is set ("table magic").
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-table
      __slots__ = tuple()

      # helper methods
      def clearStackToTableContext(self):
          """Pop open elements until the current node is ``table`` or ``html``."""
          # "clear the stack back to a table context"
          while self.tree.openElements[-1].name not in ("table", "html"):
              # self.parser.parseError("unexpected-implied-end-tag-in-table",
              #  {"name":  self.tree.openElements[-1].name})
              self.tree.openElements.pop()
          # When the current node is <html> it's an innerHTML case

      # processing methods
      def processEOF(self):
          """Report ``eof-in-table`` unless the current node is ``html``."""
          if self.tree.openElements[-1].name != "html":
              self.parser.parseError("eof-in-table")
          else:
              assert self.parser.innerHTML
          # Stop parsing

      def processSpaceCharacters(self, token):
          """Switch to the inTableText phase and let it handle the token.

          The phase that was active is stored on the inTableText phase as
          ``originalPhase``.
          """
          originalPhase = self.parser.phase
          self.parser.phase = self.parser.phases["inTableText"]
          self.parser.phase.originalPhase = originalPhase
          self.parser.phase.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Switch to the inTableText phase and let it handle the token.

          The phase that was active is stored on the inTableText phase as
          ``originalPhase``.
          """
          originalPhase = self.parser.phase
          self.parser.phase = self.parser.phases["inTableText"]
          self.parser.phase.originalPhase = originalPhase
          self.parser.phase.processCharacters(token)

      def insertText(self, token):
          """Process a character token through inBody with table insertion on.

          ``tree.insertFromTable`` is reset to ``False`` afterwards, even if
          the delegated handler raises.
          """
          # If we get here there must be at least one non-whitespace character
          # Do the table magic!
          self.tree.insertFromTable = True
          try:
              self.parser.phases["inBody"].processCharacters(token)
          finally:
              # Never leave the tree builder stuck in table-insertion mode.
              self.tree.insertFromTable = False

      def startTagCaption(self, token):
          """Insert a ``caption`` element and switch to the inCaption phase.

          A ``Marker`` is appended to the active formatting elements first.
          """
          self.clearStackToTableContext()
          self.tree.activeFormattingElements.append(Marker)
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inCaption"]

      def startTagColgroup(self, token):
          """Insert a ``colgroup`` element and switch to inColumnGroup."""
          self.clearStackToTableContext()
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inColumnGroup"]

      def startTagCol(self, token):
          """Imply a ``colgroup`` start tag, then return the ``col`` token."""
          self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
          return token

      def startTagRowGroup(self, token):
          """Insert a row group element and switch to the inTableBody phase."""
          self.clearStackToTableContext()
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inTableBody"]

      def startTagImplyTbody(self, token):
          """Imply a ``tbody`` start tag, then return the original token."""
          self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
          return token

      def startTagTable(self, token):
          """Treat a nested ``table`` start tag as an implied table end tag.

          :returns: the token, unless ``parser.innerHTML`` is set
          """
          self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                 {"startName": "table", "endName": "table"})
          self.parser.phase.processEndTag(impliedTagToken("table"))
          if not self.parser.innerHTML:
              return token

      def startTagStyleScript(self, token):
          """Delegate ``style`` and ``script`` start tags to the inHead phase."""
          return self.parser.phases["inHead"].processStartTag(token)

      def startTagInput(self, token):
          """Insert hidden inputs in place; send other inputs to startTagOther."""
          if ("type" in token["data"] and
                  token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
              self.parser.parseError("unexpected-hidden-input-in-table")
              self.tree.insertElement(token)
              # XXX associate with form
              self.tree.openElements.pop()
          else:
              self.startTagOther(token)

      def startTagForm(self, token):
          """Report a parse error; insert the form only if none is recorded.

          The inserted element becomes ``tree.formPointer`` and is popped
          from the stack of open elements straight away.
          """
          self.parser.parseError("unexpected-form-in-table")
          if self.tree.formPointer is None:
              self.tree.insertElement(token)
              self.tree.formPointer = self.tree.openElements[-1]
              self.tree.openElements.pop()

      def startTagOther(self, token):
          """Process any other start tag through inBody with table insertion on.

          ``tree.insertFromTable`` is reset to ``False`` afterwards, even if
          the delegated handler raises.
          """
          self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
          # Do the table magic!
          self.tree.insertFromTable = True
          try:
              self.parser.phases["inBody"].processStartTag(token)
          finally:
              # Never leave the tree builder stuck in table-insertion mode.
              self.tree.insertFromTable = False

      def endTagTable(self, token):
          """Close the open ``table`` and reset the insertion mode.

          With no ``table`` in table scope (innerHTML case) only a parse
          error is reported.
          """
          if self.tree.elementInScope("table", variant="table"):
              self.tree.generateImpliedEndTags()
              if self.tree.openElements[-1].name != "table":
                  self.parser.parseError("end-tag-too-early-named",
                                         {"gotName": "table",
                                          "expectedName": self.tree.openElements[-1].name})
              while self.tree.openElements[-1].name != "table":
                  self.tree.openElements.pop()
              self.tree.openElements.pop()
              self.parser.resetInsertionMode()
          else:
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()

      def endTagIgnore(self, token):
          """Report a parse error for the end tag and do nothing else."""
          self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

      def endTagOther(self, token):
          """Process any other end tag through inBody with table insertion on.

          ``tree.insertFromTable`` is reset to ``False`` afterwards, even if
          the delegated handler raises.
          """
          self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
          # Do the table magic!
          self.tree.insertFromTable = True
          try:
              self.parser.phases["inBody"].processEndTag(token)
          finally:
              # Never leave the tree builder stuck in table-insertion mode.
              self.tree.insertFromTable = False

      startTagHandler = _utils.MethodDispatcher([
          ("html", Phase.startTagHtml),
          ("caption", startTagCaption),
          ("colgroup", startTagColgroup),
          ("col", startTagCol),
          (("tbody", "tfoot", "thead"), startTagRowGroup),
          (("td", "th", "tr"), startTagImplyTbody),
          ("table", startTagTable),
          (("style", "script"), startTagStyleScript),
          ("input", startTagInput),
          ("form", startTagForm)
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          ("table", endTagTable),
          (("body", "caption", "col", "colgroup", "html", "tbody", "td",
            "tfoot", "th", "thead", "tr"), endTagIgnore)
      ])
      endTagHandler.default = endTagOther
  class InTableTextPhase(Phase):
      """Phase that buffers character tokens seen while inside a table.

      Buffered tokens are flushed as soon as any non-character token (or
      EOF) arrives; the parser then returns to ``originalPhase`` and that
      token is handed back.
      """
      __slots__ = ("originalPhase", "characterTokens")

      def __init__(self, *args, **kwargs):
          """Initialise the phase with no original phase and an empty buffer."""
          super(InTableTextPhase, self).__init__(*args, **kwargs)
          self.originalPhase = None
          self.characterTokens = []

      def flushCharacters(self):
          """Emit the buffered character data and empty the buffer.

          Data containing a character outside ``spaceCharacters`` is passed
          to the inTable phase's ``insertText`` as one Characters token;
          other non-empty data is inserted into the tree directly.
          """
          pending = "".join(tok["data"] for tok in self.characterTokens)
          hasNonSpace = any(ch not in spaceCharacters for ch in pending)
          if hasNonSpace:
              merged = {"type": tokenTypes["Characters"], "data": pending}
              self.parser.phases["inTable"].insertText(merged)
          elif pending:
              self.tree.insertText(pending)
          self.characterTokens = []

      def processComment(self, token):
          """Flush, restore the original phase and return the token."""
          self.flushCharacters()
          self.parser.phase = self.originalPhase
          return token

      def processEOF(self):
          """Flush, restore the original phase and return ``True``."""
          self.flushCharacters()
          self.parser.phase = self.originalPhase
          return True

      def processCharacters(self, token):
          """Buffer the token unless its data is exactly a NUL character."""
          if token["data"] != "\u0000":
              self.characterTokens.append(token)

      def processSpaceCharacters(self, token):
          """Buffer a whitespace token."""
          # pretty sure we should never reach here
          self.characterTokens.append(token)
          # assert False

      def processStartTag(self, token):
          """Flush, restore the original phase and return the token."""
          self.flushCharacters()
          self.parser.phase = self.originalPhase
          return token

      def processEndTag(self, token):
          """Flush, restore the original phase and return the token."""
          self.flushCharacters()
          self.parser.phase = self.originalPhase
          return token
  class InCaptionPhase(Phase):
      """Phase for tokens inside a table ``caption`` element.

      Table structure tags imply the end of the caption; everything else is
      delegated to the inBody phase.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
      __slots__ = ()

      def ignoreEndTagCaption(self):
          """Return ``True`` when no ``caption`` is in table scope."""
          captionOpen = self.tree.elementInScope("caption", variant="table")
          return not captionOpen

      def processEOF(self):
          """Delegate EOF handling to the inBody phase."""
          self.parser.phases["inBody"].processEOF()

      def processCharacters(self, token):
          """Delegate character tokens to the inBody phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processCharacters(token)

      def startTagTableElement(self, token):
          """Imply a caption end tag for a table structure start tag.

          :returns: the token, unless the implied end tag was ignored
          """
          self.parser.parseError()
          # XXX Have to duplicate logic here to find out if the tag is ignored
          captionIgnored = self.ignoreEndTagCaption()
          self.parser.phase.processEndTag(impliedTagToken("caption"))
          if captionIgnored:
              return None
          return token

      def startTagOther(self, token):
          """Delegate any other start tag to the inBody phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def endTagCaption(self, token):
          """Close the caption and switch back to the inTable phase.

          With no caption in table scope (innerHTML case) only a parse
          error is reported.
          """
          if self.ignoreEndTagCaption():
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()
              return

          # AT this code is quite similar to endTagTable in "InTable"
          self.tree.generateImpliedEndTags()
          currentName = self.tree.openElements[-1].name
          if currentName != "caption":
              self.parser.parseError("expected-one-end-tag-but-got-another",
                                     {"gotName": "caption",
                                      "expectedName": currentName})
          # Pop up to and including the caption element.
          while self.tree.openElements.pop().name != "caption":
              pass
          self.tree.clearActiveFormattingElements()
          self.parser.phase = self.parser.phases["inTable"]

      def endTagTable(self, token):
          """Imply a caption end tag for a ``table`` end tag.

          :returns: the token, unless the implied end tag was ignored
          """
          self.parser.parseError()
          captionIgnored = self.ignoreEndTagCaption()
          self.parser.phase.processEndTag(impliedTagToken("caption"))
          if captionIgnored:
              return None
          return token

      def endTagIgnore(self, token):
          """Report a parse error for the end tag and do nothing else."""
          self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

      def endTagOther(self, token):
          """Delegate any other end tag to the inBody phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processEndTag(token)

      startTagHandler = _utils.MethodDispatcher([
          ("html", Phase.startTagHtml),
          (
              ("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
               "thead", "tr"),
              startTagTableElement,
          ),
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          ("caption", endTagCaption),
          ("table", endTagTable),
          (
              ("body", "col", "colgroup", "html", "tbody", "td", "tfoot",
               "th", "thead", "tr"),
              endTagIgnore,
          ),
      ])
      endTagHandler.default = endTagOther
  class InColumnGroupPhase(Phase):
      """Phase for tokens inside a ``colgroup`` element.

      Only ``col`` start tags and ``colgroup``/``col`` end tags are handled
      directly; every other token first implies a ``colgroup`` end tag.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-column
      __slots__ = ()

      def ignoreEndTagColgroup(self):
          """Return ``True`` when the current node is ``html``."""
          currentNode = self.tree.openElements[-1]
          return currentNode.name == "html"

      def processEOF(self):
          """Imply a colgroup end tag at EOF.

          :returns: ``True`` when the colgroup was closed; ``None`` when the
              current node is ``html`` (innerHTML case)
          """
          if self.tree.openElements[-1].name == "html":
              assert self.parser.innerHTML
              return None
          colgroupIgnored = self.ignoreEndTagColgroup()
          self.endTagColgroup(impliedTagToken("colgroup"))
          if colgroupIgnored:
              return None
          return True

      def processCharacters(self, token):
          """Imply a colgroup end tag; return the token unless it was ignored."""
          colgroupIgnored = self.ignoreEndTagColgroup()
          self.endTagColgroup(impliedTagToken("colgroup"))
          if colgroupIgnored:
              return None
          return token

      def startTagCol(self, token):
          """Insert a ``col`` element, pop it and acknowledge self-closing."""
          tree = self.tree
          tree.insertElement(token)
          tree.openElements.pop()
          token["selfClosingAcknowledged"] = True

      def startTagOther(self, token):
          """Imply a colgroup end tag; return the token unless it was ignored."""
          colgroupIgnored = self.ignoreEndTagColgroup()
          self.endTagColgroup(impliedTagToken("colgroup"))
          if colgroupIgnored:
              return None
          return token

      def endTagColgroup(self, token):
          """Pop the current node and switch to the inTable phase.

          When the end tag is to be ignored (innerHTML case) only a parse
          error is reported.
          """
          if not self.ignoreEndTagColgroup():
              self.tree.openElements.pop()
              self.parser.phase = self.parser.phases["inTable"]
              return
          # innerHTML case
          assert self.parser.innerHTML
          self.parser.parseError()

      def endTagCol(self, token):
          """Report a parse error: ``col`` takes no end tag."""
          self.parser.parseError("no-end-tag", {"name": "col"})

      def endTagOther(self, token):
          """Imply a colgroup end tag; return the token unless it was ignored."""
          colgroupIgnored = self.ignoreEndTagColgroup()
          self.endTagColgroup(impliedTagToken("colgroup"))
          if colgroupIgnored:
              return None
          return token

      startTagHandler = _utils.MethodDispatcher([
          ("html", Phase.startTagHtml),
          ("col", startTagCol),
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          ("colgroup", endTagColgroup),
          ("col", endTagCol),
      ])
      endTagHandler.default = endTagOther
  class InTableBodyPhase(Phase):
      """Phase for tokens inside a ``tbody``, ``thead`` or ``tfoot`` element.

      Rows and cells are handled here; tokens with no handler in this phase
      are passed to the inTable phase.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
      __slots__ = ()

      # helper methods
      def clearStackToTableBodyContext(self):
          """Pop open elements until a row group or ``html`` is current.

          No parse error is reported for the popped elements.
          """
          stack = self.tree.openElements
          while stack[-1].name not in ("tbody", "tfoot", "thead", "html"):
              stack.pop()
          if stack[-1].name == "html":
              assert self.parser.innerHTML

      # the rest
      def processEOF(self):
          """Delegate EOF handling to the inTable phase."""
          self.parser.phases["inTable"].processEOF()

      def processSpaceCharacters(self, token):
          """Delegate whitespace tokens to the inTable phase."""
          inTable = self.parser.phases["inTable"]
          return inTable.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Delegate character tokens to the inTable phase."""
          inTable = self.parser.phases["inTable"]
          return inTable.processCharacters(token)

      def startTagTr(self, token):
          """Insert a ``tr`` element and switch to the inRow phase."""
          self.clearStackToTableBodyContext()
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inRow"]

      def startTagTableCell(self, token):
          """Imply a ``tr`` start tag for a cell, then return the cell token."""
          self.parser.parseError("unexpected-cell-in-table-body",
                                 {"name": token["name"]})
          self.startTagTr(impliedTagToken("tr", "StartTag"))
          return token

      def startTagTableOther(self, token):
          """Close the open row group before a table structure start tag.

          :returns: the token when a row group was closed; ``None`` in the
              innerHTML case, where only a parse error is reported
          """
          # XXX AT Any ideas on how to share this with endTagTable?
          rowGroupOpen = any(
              self.tree.elementInScope(group, variant="table")
              for group in ("tbody", "thead", "tfoot"))
          if not rowGroupOpen:
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()
              return None
          self.clearStackToTableBodyContext()
          currentName = self.tree.openElements[-1].name
          self.endTagTableRowGroup(impliedTagToken(currentName))
          return token

      def startTagOther(self, token):
          """Delegate any other start tag to the inTable phase."""
          inTable = self.parser.phases["inTable"]
          return inTable.processStartTag(token)

      def endTagTableRowGroup(self, token):
          """Close the named row group and switch to the inTable phase.

          A parse error is reported instead when that element is not in
          table scope.
          """
          groupName = token["name"]
          if not self.tree.elementInScope(groupName, variant="table"):
              self.parser.parseError("unexpected-end-tag-in-table-body",
                                     {"name": groupName})
              return
          self.clearStackToTableBodyContext()
          self.tree.openElements.pop()
          self.parser.phase = self.parser.phases["inTable"]

      def endTagTable(self, token):
          """Close the open row group before a ``table`` end tag.

          :returns: the token when a row group was closed; ``None`` in the
              innerHTML case, where only a parse error is reported
          """
          rowGroupOpen = any(
              self.tree.elementInScope(group, variant="table")
              for group in ("tbody", "thead", "tfoot"))
          if not rowGroupOpen:
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()
              return None
          self.clearStackToTableBodyContext()
          currentName = self.tree.openElements[-1].name
          self.endTagTableRowGroup(impliedTagToken(currentName))
          return token

      def endTagIgnore(self, token):
          """Report a parse error for the end tag and do nothing else."""
          self.parser.parseError("unexpected-end-tag-in-table-body",
                                 {"name": token["name"]})

      def endTagOther(self, token):
          """Delegate any other end tag to the inTable phase."""
          inTable = self.parser.phases["inTable"]
          return inTable.processEndTag(token)

      startTagHandler = _utils.MethodDispatcher([
          ("html", Phase.startTagHtml),
          ("tr", startTagTr),
          (("td", "th"), startTagTableCell),
          (
              ("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
              startTagTableOther,
          ),
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          (("tbody", "tfoot", "thead"), endTagTableRowGroup),
          ("table", endTagTable),
          (
              ("body", "caption", "col", "colgroup", "html", "td", "th",
               "tr"),
              endTagIgnore,
          ),
      ])
      endTagHandler.default = endTagOther
  class InRowPhase(Phase):
      """Phase for tokens inside a ``tr`` element.

      Cells open the inCell phase, table structure tags imply the end of
      the row, and tokens with no handler here go to the inTable phase.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-row
      __slots__ = ()

      # helper methods (XXX unify this with other table helper methods)
      def clearStackToTableRowContext(self):
          """Pop open elements until ``tr`` or ``html`` is the current node.

          A parse error naming the element is reported before each pop.
          """
          stack = self.tree.openElements
          while True:
              currentName = stack[-1].name
              if currentName in ("tr", "html"):
                  break
              self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                     {"name": currentName})
              stack.pop()

      def ignoreEndTagTr(self):
          """Return ``True`` when no ``tr`` is in table scope."""
          rowOpen = self.tree.elementInScope("tr", variant="table")
          return not rowOpen

      # the rest
      def processEOF(self):
          """Delegate EOF handling to the inTable phase."""
          self.parser.phases["inTable"].processEOF()

      def processSpaceCharacters(self, token):
          """Delegate whitespace tokens to the inTable phase."""
          inTable = self.parser.phases["inTable"]
          return inTable.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Delegate character tokens to the inTable phase."""
          inTable = self.parser.phases["inTable"]
          return inTable.processCharacters(token)

      def startTagTableCell(self, token):
          """Insert a cell, switch to inCell and push a formatting ``Marker``."""
          self.clearStackToTableRowContext()
          self.tree.insertElement(token)
          self.parser.phase = self.parser.phases["inCell"]
          self.tree.activeFormattingElements.append(Marker)

      def startTagTableOther(self, token):
          """Imply a ``tr`` end tag; return the token unless it was ignored."""
          rowIgnored = self.ignoreEndTagTr()
          self.endTagTr(impliedTagToken("tr"))
          # XXX how are we sure it's always ignored in the innerHTML case?
          if rowIgnored:
              return None
          return token

      def startTagOther(self, token):
          """Delegate any other start tag to the inTable phase."""
          inTable = self.parser.phases["inTable"]
          return inTable.processStartTag(token)

      def endTagTr(self, token):
          """Close the row and switch to the inTableBody phase.

          With no ``tr`` in table scope (innerHTML case) only a parse error
          is reported.
          """
          if self.ignoreEndTagTr():
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()
              return
          self.clearStackToTableRowContext()
          self.tree.openElements.pop()
          self.parser.phase = self.parser.phases["inTableBody"]

      def endTagTable(self, token):
          """Imply a ``tr`` end tag for a ``table`` end tag.

          :returns: the token, for reprocessing, unless the implied ``tr``
              end tag was ignored
          """
          rowIgnored = self.ignoreEndTagTr()
          self.endTagTr(impliedTagToken("tr"))
          # Reprocess the current tag if the tr end tag was not ignored
          # XXX how are we sure it's always ignored in the innerHTML case?
          if rowIgnored:
              return None
          return token

      def endTagTableRowGroup(self, token):
          """Imply a ``tr`` end tag when the named row group is in table scope.

          :returns: the token in that case; otherwise a parse error is
              reported and ``None`` is returned
          """
          if not self.tree.elementInScope(token["name"], variant="table"):
              self.parser.parseError()
              return None
          self.endTagTr(impliedTagToken("tr"))
          return token

      def endTagIgnore(self, token):
          """Report a parse error for the end tag and do nothing else."""
          self.parser.parseError("unexpected-end-tag-in-table-row",
                                 {"name": token["name"]})

      def endTagOther(self, token):
          """Delegate any other end tag to the inTable phase."""
          inTable = self.parser.phases["inTable"]
          return inTable.processEndTag(token)

      startTagHandler = _utils.MethodDispatcher([
          ("html", Phase.startTagHtml),
          (("td", "th"), startTagTableCell),
          (
              ("caption", "col", "colgroup", "tbody", "tfoot", "thead",
               "tr"),
              startTagTableOther,
          ),
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          ("tr", endTagTr),
          ("table", endTagTable),
          (("tbody", "tfoot", "thead"), endTagTableRowGroup),
          (
              ("body", "caption", "col", "colgroup", "html", "td", "th"),
              endTagIgnore,
          ),
      ])
      endTagHandler.default = endTagOther
  class InCellPhase(Phase):
      """Phase for tokens inside a ``td`` or ``th`` element.

      Table structure tags close the open cell first; all other content is
      delegated to the inBody phase.
      """
      # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
      __slots__ = ()

      # helper
      def closeCell(self):
          """Close the cell in table scope, checking ``td`` before ``th``."""
          for cellName in ("td", "th"):
              if self.tree.elementInScope(cellName, variant="table"):
                  self.endTagTableCell(impliedTagToken(cellName))
                  break

      # the rest
      def processEOF(self):
          """Delegate EOF handling to the inBody phase."""
          self.parser.phases["inBody"].processEOF()

      def processCharacters(self, token):
          """Delegate character tokens to the inBody phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processCharacters(token)

      def startTagTableOther(self, token):
          """Close the open cell before a table structure start tag.

          :returns: the token when a cell was in table scope; ``None`` in
              the innerHTML case, where only a parse error is reported
          """
          cellOpen = (self.tree.elementInScope("td", variant="table") or
                      self.tree.elementInScope("th", variant="table"))
          if not cellOpen:
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()
              return None
          self.closeCell()
          return token

      def startTagOther(self, token):
          """Delegate any other start tag to the inBody phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def endTagTableCell(self, token):
          """Close the named cell and switch to the inRow phase.

          A parse error is reported when the cell is not the current node
          (everything above it is popped too) or when it is not in table
          scope at all (the token is then dropped).
          """
          cellName = token["name"]
          if not self.tree.elementInScope(cellName, variant="table"):
              self.parser.parseError("unexpected-end-tag", {"name": cellName})
              return

          self.tree.generateImpliedEndTags(cellName)
          stack = self.tree.openElements
          if stack[-1].name == cellName:
              stack.pop()
          else:
              self.parser.parseError("unexpected-cell-end-tag",
                                     {"name": cellName})
              # Pop up to and including the cell element.
              while stack.pop().name != cellName:
                  pass
          self.tree.clearActiveFormattingElements()
          self.parser.phase = self.parser.phases["inRow"]

      def endTagIgnore(self, token):
          """Report a parse error for the end tag and do nothing else."""
          self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

      def endTagImply(self, token):
          """Close the cell when the named element is in table scope.

          :returns: the token in that case; otherwise a parse error is
              reported and ``None`` is returned
          """
          if not self.tree.elementInScope(token["name"], variant="table"):
              # sometimes innerHTML case
              self.parser.parseError()
              return None
          self.closeCell()
          return token

      def endTagOther(self, token):
          """Delegate any other end tag to the inBody phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processEndTag(token)

      startTagHandler = _utils.MethodDispatcher([
          ("html", Phase.startTagHtml),
          (
              ("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
               "thead", "tr"),
              startTagTableOther,
          ),
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          (("td", "th"), endTagTableCell),
          (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
          (("table", "tbody", "tfoot", "thead", "tr"), endTagImply),
      ])
      endTagHandler.default = endTagOther
  class InSelectPhase(Phase):
      """Token handlers for the "in select" insertion mode.

      http://www.whatwg.org/specs/web-apps/current-work/#in-select
      """
      __slots__ = tuple()

      def processEOF(self):
          """Report EOF unless the current node is the root <html> element."""
          if self.tree.openElements[-1].name == "html":
              # Reaching <html> here is only expected for fragment parsing.
              assert self.parser.innerHTML
          else:
              self.parser.parseError("eof-in-select")

      def processCharacters(self, token):
          """Insert character data, dropping a lone U+0000 token."""
          text = token["data"]
          if text != "\u0000":
              self.tree.insertText(text)

      def startTagOption(self, token):
          """Insert <option>, first implying </option> for an open one."""
          stack = self.tree.openElements
          # We need to imply </option> if <option> is the current node.
          if stack[-1].name == "option":
              stack.pop()
          self.tree.insertElement(token)

      def startTagOptgroup(self, token):
          """Insert <optgroup>, closing an open <option> and then <optgroup>."""
          stack = self.tree.openElements
          # Order matters: the option is checked (and popped) before the
          # optgroup that may be sitting underneath it.
          for impliedName in ("option", "optgroup"):
              if stack[-1].name == impliedName:
                  stack.pop()
          self.tree.insertElement(token)

      def startTagSelect(self, token):
          """Treat a nested <select> as an (erroneous) </select>."""
          self.parser.parseError("unexpected-select-in-select")
          self.endTagSelect(impliedTagToken("select"))

      def startTagInput(self, token):
          """Close the select and hand the token back for reprocessing."""
          self.parser.parseError("unexpected-input-in-select")
          if not self.tree.elementInScope("select", variant="select"):
              # No select in scope: only expected for fragment parsing.
              assert self.parser.innerHTML
              return None
          self.endTagSelect(impliedTagToken("select"))
          return token

      def startTagScript(self, token):
          """Delegate <script> to the "inHead" phase."""
          return self.parser.phases["inHead"].processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as a parse error and ignore it."""
          self.parser.parseError("unexpected-start-tag-in-select",
                                 {"name": token["name"]})

      def endTagOption(self, token):
          """Pop the current <option>, or report a stray </option>."""
          stack = self.tree.openElements
          if stack[-1].name != "option":
              self.parser.parseError("unexpected-end-tag-in-select",
                                     {"name": "option"})
              return
          stack.pop()

      def endTagOptgroup(self, token):
          """Pop the current <optgroup>, implying </option> when needed."""
          stack = self.tree.openElements
          # </optgroup> implicitly closes <option>, but only when that
          # option sits directly inside an optgroup.
          if stack[-1].name == "option" and stack[-2].name == "optgroup":
              stack.pop()
          # Nothing other than an optgroup may be closed by this tag.
          if stack[-1].name != "optgroup":
              self.parser.parseError("unexpected-end-tag-in-select",
                                     {"name": "optgroup"})
              return
          stack.pop()

      def endTagSelect(self, token):
          """Pop up to and including <select>, then reset the insertion mode."""
          if not self.tree.elementInScope("select", variant="select"):
              # innerHTML case
              assert self.parser.innerHTML
              self.parser.parseError()
              return
          stack = self.tree.openElements
          # Discard elements until the select itself has been popped.
          while stack.pop().name != "select":
              pass
          self.parser.resetInsertionMode()

      def endTagOther(self, token):
          """Report any other end tag as a parse error and ignore it."""
          self.parser.parseError("unexpected-end-tag-in-select",
                                 {"name": token["name"]})

      startTagHandler = _utils.MethodDispatcher([
          ("html", Phase.startTagHtml),
          ("option", startTagOption),
          ("optgroup", startTagOptgroup),
          ("select", startTagSelect),
          (("input", "keygen", "textarea"), startTagInput),
          ("script", startTagScript),
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          ("option", endTagOption),
          ("optgroup", endTagOptgroup),
          ("select", endTagSelect),
      ])
      endTagHandler.default = endTagOther
  class InSelectInTablePhase(Phase):
      """Token handlers for a <select> that was opened inside a table.

      Table-related tags close the select; everything else is forwarded to
      the "inSelect" phase.
      """
      __slots__ = tuple()

      def processEOF(self):
          """Forward EOF handling to the "inSelect" phase."""
          inSelect = self.parser.phases["inSelect"]
          inSelect.processEOF()

      def processCharacters(self, token):
          """Forward character tokens to the "inSelect" phase."""
          inSelect = self.parser.phases["inSelect"]
          return inSelect.processCharacters(token)

      def startTagTable(self, token):
          """Close the select on a table start tag and reprocess the token."""
          self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
          self.endTagOther(impliedTagToken("select"))
          return token

      def startTagOther(self, token):
          """Forward any other start tag to the "inSelect" phase."""
          inSelect = self.parser.phases["inSelect"]
          return inSelect.processStartTag(token)

      def endTagTable(self, token):
          """Close the select on a table end tag whose element is in table scope."""
          tagName = token["name"]
          self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": tagName})
          if not self.tree.elementInScope(tagName, variant="table"):
              # Nothing to close: the token is dropped.
              return None
          self.endTagOther(impliedTagToken("select"))
          return token

      def endTagOther(self, token):
          """Forward any other end tag to the "inSelect" phase."""
          inSelect = self.parser.phases["inSelect"]
          return inSelect.processEndTag(token)

      startTagHandler = _utils.MethodDispatcher([
          (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
           startTagTable),
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
           endTagTable),
      ])
      endTagHandler.default = endTagOther
  class InForeignContentPhase(Phase):
      """Token handlers used while the current node is in a foreign namespace.

      Covers elements in the MathML and SVG namespaces: HTML "breakout"
      start tags pop back to HTML content, other start tags are inserted in
      the current node's namespace after name/attribute case adjustment.
      """
      __slots__ = tuple()

      # HTML start tags that terminate foreign content.
      breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                    "center", "code", "dd", "div", "dl", "dt",
                                    "em", "embed", "h1", "h2", "h3",
                                    "h4", "h5", "h6", "head", "hr", "i", "img",
                                    "li", "listing", "menu", "meta", "nobr",
                                    "ol", "p", "pre", "ruby", "s", "small",
                                    "span", "strong", "strike", "sub", "sup",
                                    "table", "tt", "u", "ul", "var"])

      # Lower-cased tag name -> mixed-case SVG element name.  Built once at
      # class creation instead of on every call to adjustSVGTagNames().
      svgTagNameReplacements = {
          "altglyph": "altGlyph",
          "altglyphdef": "altGlyphDef",
          "altglyphitem": "altGlyphItem",
          "animatecolor": "animateColor",
          "animatemotion": "animateMotion",
          "animatetransform": "animateTransform",
          "clippath": "clipPath",
          "feblend": "feBlend",
          "fecolormatrix": "feColorMatrix",
          "fecomponenttransfer": "feComponentTransfer",
          "fecomposite": "feComposite",
          "feconvolvematrix": "feConvolveMatrix",
          "fediffuselighting": "feDiffuseLighting",
          "fedisplacementmap": "feDisplacementMap",
          "fedistantlight": "feDistantLight",
          # Listed in the WHATWG foreign-content table.
          "fedropshadow": "feDropShadow",
          "feflood": "feFlood",
          "fefunca": "feFuncA",
          "fefuncb": "feFuncB",
          "fefuncg": "feFuncG",
          "fefuncr": "feFuncR",
          "fegaussianblur": "feGaussianBlur",
          "feimage": "feImage",
          "femerge": "feMerge",
          "femergenode": "feMergeNode",
          "femorphology": "feMorphology",
          "feoffset": "feOffset",
          "fepointlight": "fePointLight",
          "fespecularlighting": "feSpecularLighting",
          "fespotlight": "feSpotLight",
          "fetile": "feTile",
          "feturbulence": "feTurbulence",
          "foreignobject": "foreignObject",
          "glyphref": "glyphRef",
          "lineargradient": "linearGradient",
          "radialgradient": "radialGradient",
          "textpath": "textPath",
      }

      def adjustSVGTagNames(self, token):
          """Rewrite ``token["name"]`` in place to its mixed-case SVG spelling.

          Names without an entry in ``svgTagNameReplacements`` are left
          untouched.
          """
          replacement = self.svgTagNameReplacements.get(token["name"])
          if replacement is not None:
              token["name"] = replacement

      def processCharacters(self, token):
          """Insert character data, replacing a lone U+0000 with U+FFFD.

          Any non-space character clears the parser's ``framesetOK`` flag.
          """
          if token["data"] == "\u0000":
              token["data"] = "\uFFFD"
          elif (self.parser.framesetOK and
                any(char not in spaceCharacters for char in token["data"])):
              self.parser.framesetOK = False
          Phase.processCharacters(self, token)

      def processStartTag(self, token):
          """Handle a start tag seen in foreign content.

          Returns the token for reprocessing when it is an HTML breakout
          element (or a <font> carrying color/face/size); otherwise inserts
          the element in the current node's namespace and returns None.
          """
          currentNode = self.tree.openElements[-1]
          if (token["name"] in self.breakoutElements or
              (token["name"] == "font" and
               set(token["data"].keys()) & {"color", "face", "size"})):
              self.parser.parseError("unexpected-html-element-in-foreign-content",
                                     {"name": token["name"]})
              # Pop until the current node is an HTML element or one of the
              # integration points.
              while (self.tree.openElements[-1].namespace !=
                     self.tree.defaultNamespace and
                     not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                     not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                  self.tree.openElements.pop()
              return token

          else:
              if currentNode.namespace == namespaces["mathml"]:
                  self.parser.adjustMathMLAttributes(token)
              elif currentNode.namespace == namespaces["svg"]:
                  self.adjustSVGTagNames(token)
                  self.parser.adjustSVGAttributes(token)
              self.parser.adjustForeignAttributes(token)
              # The new element inherits the namespace of the node that was
              # current when the token arrived.
              token["namespace"] = currentNode.namespace
              self.tree.insertElement(token)
              if token["selfClosing"]:
                  self.tree.openElements.pop()
                  token["selfClosingAcknowledged"] = True

      def processEndTag(self, token):
          """Handle an end tag seen in foreign content.

          Walks down the stack of open elements looking for a node whose
          lower-cased name matches the token.  On a match everything up to
          and including that node is popped and None is returned.  If an
          element in the default namespace is reached first, the token is
          passed to the parser's current phase and its result is returned.
          """
          nodeIndex = len(self.tree.openElements) - 1
          node = self.tree.openElements[-1]
          if node.name.translate(asciiUpper2Lower) != token["name"]:
              self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

          while True:
              if node.name.translate(asciiUpper2Lower) == token["name"]:
                  # XXX this isn't in the spec but it seems necessary
                  if self.parser.phase == self.parser.phases["inTableText"]:
                      self.parser.phase.flushCharacters()
                      self.parser.phase = self.parser.phase.originalPhase
                  while self.tree.openElements.pop() != node:
                      assert self.tree.openElements
                  new_token = None
                  break
              nodeIndex -= 1

              node = self.tree.openElements[nodeIndex]
              if node.namespace != self.tree.defaultNamespace:
                  continue
              else:
                  new_token = self.parser.phase.processEndTag(token)
                  break
          return new_token
  class AfterBodyPhase(Phase):
      """Token handlers for the phase entered once the body has been closed.

      Apart from comments, <html> tags and EOF, every token is a parse error
      that switches the parser back to the "inBody" phase for reprocessing.
      """
      __slots__ = tuple()

      def processEOF(self):
          """Stop parsing: nothing is left to do."""
          pass

      def processComment(self, token):
          """Attach the comment to the first entry of the open-elements stack."""
          # This is needed because data is to be appended to the <html> element
          # here and not to whatever is currently open.
          htmlElement = self.tree.openElements[0]
          self.tree.insertComment(token, htmlElement)

      def processCharacters(self, token):
          """Report stray characters and reprocess them in "inBody"."""
          parser = self.parser
          parser.parseError("unexpected-char-after-body")
          parser.phase = parser.phases["inBody"]
          return token

      def startTagHtml(self, token):
          """Delegate <html> to the "inBody" phase."""
          return self.parser.phases["inBody"].processStartTag(token)

      def startTagOther(self, token):
          """Report a stray start tag and reprocess it in "inBody"."""
          parser = self.parser
          parser.parseError("unexpected-start-tag-after-body",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token

      def endTagHtml(self, name):
          """Move to "afterAfterBody", or report an error when parsing a fragment."""
          parser = self.parser
          if not parser.innerHTML:
              parser.phase = parser.phases["afterAfterBody"]
          else:
              parser.parseError("unexpected-end-tag-after-body-innerhtml")

      def endTagOther(self, token):
          """Report a stray end tag and reprocess it in "inBody"."""
          parser = self.parser
          parser.parseError("unexpected-end-tag-after-body",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token

      startTagHandler = _utils.MethodDispatcher([("html", startTagHtml)])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([
          ("html", endTagHtml),
      ])
      endTagHandler.default = endTagOther
  class InFramesetPhase(Phase):
      """Token handlers for the "in frameset" insertion mode.

      http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
      """
      __slots__ = tuple()

      def processEOF(self):
          """Report EOF unless the current node is the root <html> element."""
          if self.tree.openElements[-1].name == "html":
              # Reaching <html> here is only expected for fragment parsing.
              assert self.parser.innerHTML
          else:
              self.parser.parseError("eof-in-frameset")

      def processCharacters(self, token):
          """Report character data as a parse error and drop it."""
          self.parser.parseError("unexpected-char-in-frameset")

      def startTagFrameset(self, token):
          """Insert a nested <frameset> and leave it open."""
          self.tree.insertElement(token)

      def startTagFrame(self, token):
          """Insert <frame> and immediately pop it off the stack."""
          tree = self.tree
          tree.insertElement(token)
          tree.openElements.pop()

      def startTagNoframes(self, token):
          """Delegate <noframes> to the "inBody" phase."""
          return self.parser.phases["inBody"].processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as a parse error and ignore it."""
          self.parser.parseError("unexpected-start-tag-in-frameset",
                                 {"name": token["name"]})

      def endTagFrameset(self, token):
          """Close the current frameset and possibly leave this phase."""
          stack = self.tree.openElements
          if stack[-1].name != "html":
              stack.pop()
          else:
              # innerHTML case
              self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
          # If we're not in innerHTML mode and the current node is not a
          # "frameset" element (anymore) then switch.
          if not self.parser.innerHTML and stack[-1].name != "frameset":
              self.parser.phase = self.parser.phases["afterFrameset"]

      def endTagOther(self, token):
          """Report any other end tag as a parse error and ignore it."""
          self.parser.parseError("unexpected-end-tag-in-frameset",
                                 {"name": token["name"]})

      startTagHandler = _utils.MethodDispatcher([
          ("html", Phase.startTagHtml),
          ("frameset", startTagFrameset),
          ("frame", startTagFrame),
          ("noframes", startTagNoframes),
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([("frameset", endTagFrameset)])
      endTagHandler.default = endTagOther
  class AfterFramesetPhase(Phase):
      """Token handlers for the "after frameset" insertion mode.

      http://www.whatwg.org/specs/web-apps/current-work/#after3
      """
      __slots__ = tuple()

      def processEOF(self):
          """Stop parsing: nothing is left to do."""
          pass

      def processCharacters(self, token):
          """Report character data as a parse error and drop it."""
          self.parser.parseError("unexpected-char-after-frameset")

      def startTagNoframes(self, token):
          """Delegate <noframes> to the "inHead" phase."""
          inHead = self.parser.phases["inHead"]
          return inHead.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as a parse error and ignore it."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-start-tag-after-frameset",
                                 errorData)

      def endTagHtml(self, token):
          """Switch the parser to the "afterAfterFrameset" phase."""
          parser = self.parser
          parser.phase = parser.phases["afterAfterFrameset"]

      def endTagOther(self, token):
          """Report any other end tag as a parse error and ignore it."""
          errorData = {"name": token["name"]}
          self.parser.parseError("unexpected-end-tag-after-frameset",
                                 errorData)

      startTagHandler = _utils.MethodDispatcher([
          ("html", Phase.startTagHtml),
          ("noframes", startTagNoframes),
      ])
      startTagHandler.default = startTagOther

      endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
      endTagHandler.default = endTagOther
  class AfterAfterBodyPhase(Phase):
      """Token handlers for the phase that follows </html> after a body.

      Comments are attached to the document; anything other than whitespace
      or <html> is a parse error that sends the parser back to "inBody".
      """
      __slots__ = tuple()

      def processEOF(self):
          """Nothing to do at end of file."""
          pass

      def processComment(self, token):
          """Attach the comment to the document node."""
          tree = self.tree
          tree.insertComment(token, tree.document)

      def processSpaceCharacters(self, token):
          """Delegate whitespace to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Report stray characters and reprocess them in "inBody"."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-char")
          parser.phase = parser.phases["inBody"]
          return token

      def startTagHtml(self, token):
          """Delegate <html> to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def startTagOther(self, token):
          """Report a stray start tag and reprocess it in "inBody"."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-start-tag",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token

      def processEndTag(self, token):
          """Report any end tag and reprocess it in "inBody"."""
          parser = self.parser
          parser.parseError("expected-eof-but-got-end-tag",
                            {"name": token["name"]})
          parser.phase = parser.phases["inBody"]
          return token

      startTagHandler = _utils.MethodDispatcher([("html", startTagHtml)])
      startTagHandler.default = startTagOther
  class AfterAfterFramesetPhase(Phase):
      """Token handlers for the phase that follows </html> after a frameset.

      Comments are attached to the document; stray characters and tags are
      reported as parse errors and dropped without changing phase.
      """
      __slots__ = tuple()

      def processEOF(self):
          """Nothing to do at end of file."""
          pass

      def processComment(self, token):
          """Attach the comment to the document node."""
          tree = self.tree
          tree.insertComment(token, tree.document)

      def processSpaceCharacters(self, token):
          """Delegate whitespace to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processSpaceCharacters(token)

      def processCharacters(self, token):
          """Report character data as a parse error and drop it."""
          self.parser.parseError("expected-eof-but-got-char")

      def startTagHtml(self, token):
          """Delegate <html> to the "inBody" phase."""
          inBody = self.parser.phases["inBody"]
          return inBody.processStartTag(token)

      def startTagNoFrames(self, token):
          """Delegate <noframes> to the "inHead" phase."""
          inHead = self.parser.phases["inHead"]
          return inHead.processStartTag(token)

      def startTagOther(self, token):
          """Report any other start tag as a parse error and ignore it."""
          errorData = {"name": token["name"]}
          self.parser.parseError("expected-eof-but-got-start-tag", errorData)

      def processEndTag(self, token):
          """Report any end tag as a parse error and ignore it."""
          errorData = {"name": token["name"]}
          self.parser.parseError("expected-eof-but-got-end-tag", errorData)

      startTagHandler = _utils.MethodDispatcher([
          ("html", startTagHtml),
          ("noframes", startTagNoFrames),
      ])
      startTagHandler.default = startTagOther
  2290. # pylint:enable=unused-argument
  2291. return {
  2292. "initial": InitialPhase,
  2293. "beforeHtml": BeforeHtmlPhase,
  2294. "beforeHead": BeforeHeadPhase,
  2295. "inHead": InHeadPhase,
  2296. "inHeadNoscript": InHeadNoscriptPhase,
  2297. "afterHead": AfterHeadPhase,
  2298. "inBody": InBodyPhase,
  2299. "text": TextPhase,
  2300. "inTable": InTablePhase,
  2301. "inTableText": InTableTextPhase,
  2302. "inCaption": InCaptionPhase,
  2303. "inColumnGroup": InColumnGroupPhase,
  2304. "inTableBody": InTableBodyPhase,
  2305. "inRow": InRowPhase,
  2306. "inCell": InCellPhase,
  2307. "inSelect": InSelectPhase,
  2308. "inSelectInTable": InSelectInTablePhase,
  2309. "inForeignContent": InForeignContentPhase,
  2310. "afterBody": AfterBodyPhase,
  2311. "inFrameset": InFramesetPhase,
  2312. "afterFrameset": AfterFramesetPhase,
  2313. "afterAfterBody": AfterAfterBodyPhase,
  2314. "afterAfterFrameset": AfterAfterFramesetPhase,
  2315. # XXX after after frameset
  2316. }
  def adjust_attributes(token, replacements):
      """Rename attributes of *token* in place according to *replacements*.

      ``token['data']`` is rebuilt, using its own mapping type, only when at
      least one of its keys appears in *replacements*; otherwise the token
      is left untouched.
      """
      attributes = token['data']
      if not (viewkeys(attributes) & viewkeys(replacements)):
          return
      rebuild = type(attributes)
      token['data'] = rebuild((replacements.get(key, key), value)
                              for key, value in attributes.items())
  def impliedTagToken(name, type="EndTag", attributes=None,
                      selfClosing=False):
      """Build a tag token dict for a tag the parser implies itself.

      *type* is a key into ``tokenTypes``; when *attributes* is None a new
      empty dict is used as the token's data.
      """
      token = {"type": tokenTypes[type], "name": name}
      token["data"] = {} if attributes is None else attributes
      token["selfClosing"] = selfClosing
      return token
  class ParseError(Exception):
      """Error in parsed document.

      Plain ``Exception`` subclass with no behaviour of its own.
      """